From 14790eeae359971a5045f399c0bd0fe549e10c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 24 Feb 2019 19:44:24 +0100 Subject: [PATCH 01/44] chore: Move index related things to the meilidb-core workspace member --- Cargo.toml | 60 +- meilidb-core/Cargo.toml | 21 + {src => meilidb-core/src}/automaton.rs | 0 .../src}/criterion/document_id.rs | 5 +- .../src}/criterion/exact.rs | 6 +- .../src}/criterion/mod.rs | 6 +- .../src}/criterion/number_of_words.rs | 6 +- .../src}/criterion/sort_by_attr.rs | 4 +- .../src}/criterion/sum_of_typos.rs | 4 +- .../src}/criterion/sum_of_words_attribute.rs | 6 +- .../src}/criterion/sum_of_words_position.rs | 6 +- .../src}/criterion/words_proximity.rs | 6 +- {src => meilidb-core/src}/data/doc_ids.rs | 0 {src => meilidb-core/src}/data/doc_indexes.rs | 0 {src => meilidb-core/src}/data/mod.rs | 0 {src => meilidb-core/src}/data/shared_data.rs | 0 .../rank => meilidb-core/src}/distinct_map.rs | 0 {src/database => meilidb-core/src}/index.rs | 0 src/rank/mod.rs => meilidb-core/src/lib.rs | 118 +- .../src}/query_builder.rs | 22 +- .../src}/shared_data_cursor.rs | 0 {src => meilidb-core/src}/write_to_bytes.rs | 0 meilidb/Cargo.lock | 1072 +++++++++++++++++ meilidb/Cargo.toml | 50 + {src => meilidb/src}/common_words.rs | 0 {src => meilidb/src}/database/config.rs | 0 {src => meilidb/src}/database/document_key.rs | 2 +- {src => meilidb/src}/database/mod.rs | 8 +- {src => meilidb/src}/database/number.rs | 0 {src => meilidb/src}/database/schema.rs | 2 +- .../src}/database/serde/deserializer.rs | 2 +- .../src}/database/serde/find_id.rs | 2 +- .../src}/database/serde/indexer_serializer.rs | 3 +- .../src}/database/serde/key_to_string.rs | 0 {src => meilidb/src}/database/serde/mod.rs | 0 .../src}/database/serde/serializer.rs | 2 +- .../src}/database/serde/value_to_number.rs | 0 .../src}/database/update/index_event.rs | 6 +- {src => meilidb/src}/database/update/mod.rs | 7 +- .../src}/database/update/ranked_map_event.rs | 6 +- {src => meilidb/src}/database/view.rs | 5 +- meilidb/src/lib.rs | 22 + {src => meilidb/src}/tokenizer/mod.rs | 0 src/lib.rs | 136 --- 44 files changed, 1343 insertions(+), 252 deletions(-) create mode 100644 meilidb-core/Cargo.toml rename {src => meilidb-core/src}/automaton.rs (100%) rename {src/rank => meilidb-core/src}/criterion/document_id.rs (76%) rename {src/rank => meilidb-core/src}/criterion/exact.rs (92%) rename {src/rank => meilidb-core/src}/criterion/mod.rs (97%) rename {src/rank => meilidb-core/src}/criterion/number_of_words.rs (89%) rename {src/rank => meilidb-core/src}/criterion/sort_by_attr.rs (98%) rename {src/rank => meilidb-core/src}/criterion/sum_of_typos.rs (97%) rename {src/rank => meilidb-core/src}/criterion/sum_of_words_attribute.rs (92%) rename {src/rank => meilidb-core/src}/criterion/sum_of_words_position.rs (93%) rename {src/rank => meilidb-core/src}/criterion/words_proximity.rs (98%) rename {src => meilidb-core/src}/data/doc_ids.rs (100%) rename {src => meilidb-core/src}/data/doc_indexes.rs (100%) rename {src => meilidb-core/src}/data/mod.rs (100%) rename {src => meilidb-core/src}/data/shared_data.rs (100%) rename {src/rank => meilidb-core/src}/distinct_map.rs (100%) rename {src/database => meilidb-core/src}/index.rs (100%) rename src/rank/mod.rs => meilidb-core/src/lib.rs (62%) rename {src/rank => meilidb-core/src}/query_builder.rs (94%) rename {src => meilidb-core/src}/shared_data_cursor.rs (100%) rename {src => meilidb-core/src}/write_to_bytes.rs (100%) create mode 100644 meilidb/Cargo.lock create mode 100644 meilidb/Cargo.toml rename {src => meilidb/src}/common_words.rs (100%) rename {src => meilidb/src}/database/config.rs (100%) rename {src => meilidb/src}/database/document_key.rs (99%) rename {src => meilidb/src}/database/mod.rs (99%) rename {src => meilidb/src}/database/number.rs (100%) rename {src => meilidb/src}/database/schema.rs (99%) rename {src => meilidb/src}/database/serde/deserializer.rs (99%) rename {src => meilidb/src}/database/serde/find_id.rs (99%) rename {src => meilidb/src}/database/serde/indexer_serializer.rs (99%) rename {src => meilidb/src}/database/serde/key_to_string.rs (100%) rename {src => meilidb/src}/database/serde/mod.rs (100%) rename {src => meilidb/src}/database/serde/serializer.rs (99%) rename {src => meilidb/src}/database/serde/value_to_number.rs (100%) rename {src => meilidb/src}/database/update/index_event.rs (90%) rename {src => meilidb/src}/database/update/mod.rs (98%) rename {src => meilidb/src}/database/update/ranked_map_event.rs (90%) rename {src => meilidb/src}/database/view.rs (98%) create mode 100644 meilidb/src/lib.rs rename {src => meilidb/src}/tokenizer/mod.rs (100%) delete mode 100644 src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index dd994020d..df9c871ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,55 +1,5 @@ -[package] -edition = "2018" -name = "meilidb" -version = "0.3.2" -authors = ["Kerollmops "] - -[dependencies] -arc-swap = "0.3.7" -bincode = "1.1.2" -byteorder = "1.3.1" -fst = "0.3.3" -hashbrown = { version = "0.1.8", features = ["serde"] } -lazy_static = "1.2.0" -levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } -linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } -lockfree = "0.5.1" -log = "0.4.6" -rayon = "1.0.3" -sdset = "0.3.1" -serde = "1.0.88" -serde_derive = "1.0.88" -serde_json = { version = "1.0.38", features = ["preserve_order"] } -size_format = "1.0.2" -slice-group-by = "0.2.4" -unidecode = "0.3.0" - -[dependencies.toml] -git = "https://github.com/Kerollmops/toml-rs.git" -features = ["preserve_order"] -rev = "0372ba6" - -[dependencies.rocksdb] -git = "https://github.com/pingcap/rust-rocksdb.git" -rev = "306e201" - -[features] -default = ["simd"] -i128 = ["bincode/i128", "byteorder/i128"] -portable = ["rocksdb/portable"] -simd = ["rocksdb/sse"] -nightly = ["hashbrown/nightly", "slice-group-by/nightly"] - -[dev-dependencies] -csv = "1.0.5" -env_logger = "0.6.0" -jemallocator = "0.1.9" -quickcheck = "0.8.2" -rand = "0.6.5" -rand_xorshift = "0.1.1" -structopt = "0.2.14" -tempfile = "3.0.7" -termcolor = "1.0.4" - -[profile.release] -debug = true +[workspace] +members = [ + "meilidb", + "meilidb-core", +] diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml new file mode 100644 index 000000000..5523a1331 --- /dev/null +++ b/meilidb-core/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "meilidb-core" +version = "0.1.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +byteorder = "1.3.1" +fst = "0.3.3" +hashbrown = "0.1.8" +lazy_static = "1.2.0" +levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } +log = "0.4.6" +rayon = "1.0.3" +sdset = "0.3.1" +serde = "1.0.88" +serde_derive = "1.0.88" +slice-group-by = "0.2.4" + +[features] +i128 = ["byteorder/i128"] diff --git a/src/automaton.rs b/meilidb-core/src/automaton.rs similarity index 100% rename from src/automaton.rs rename to meilidb-core/src/automaton.rs diff --git a/src/rank/criterion/document_id.rs b/meilidb-core/src/criterion/document_id.rs similarity index 76% rename from src/rank/criterion/document_id.rs rename to meilidb-core/src/criterion/document_id.rs index 8e4cf91b5..27025a2da 100644 --- a/src/rank/criterion/document_id.rs +++ b/meilidb-core/src/criterion/document_id.rs @@ -1,7 +1,6 @@ use std::cmp::Ordering; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[derive(Debug, Clone, Copy)] pub struct DocumentId; diff --git a/src/rank/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs similarity index 92% rename from src/rank/criterion/exact.rs rename to meilidb-core/src/criterion/exact.rs index 6933aaff5..b76e9ace5 100644 --- a/src/rank/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use slice_group_by::GroupBy; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[inline] fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { diff --git a/src/rank/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs similarity index 97% rename from src/rank/criterion/mod.rs rename to meilidb-core/src/criterion/mod.rs index 78c1bff5a..3e2fd5028 100644 --- a/src/rank/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -4,11 +4,11 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; -mod sort_by_attr; +// mod sort_by_attr; mod document_id; use std::cmp::Ordering; -use crate::rank::RawDocument; +use crate::RawDocument; pub use self::{ sum_of_typos::SumOfTypos, @@ -17,7 +17,7 @@ pub use self::{ sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, exact::Exact, - sort_by_attr::SortByAttr, + // sort_by_attr::SortByAttr, document_id::DocumentId, }; diff --git a/src/rank/criterion/number_of_words.rs b/meilidb-core/src/criterion/number_of_words.rs similarity index 89% rename from src/rank/criterion/number_of_words.rs rename to meilidb-core/src/criterion/number_of_words.rs index 0c6f5a200..798123e6a 100644 --- a/src/rank/criterion/number_of_words.rs +++ b/meilidb-core/src/criterion/number_of_words.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use slice_group_by::GroupBy; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[inline] fn number_of_query_words(query_index: &[u32]) -> usize { diff --git a/src/rank/criterion/sort_by_attr.rs b/meilidb-core/src/criterion/sort_by_attr.rs similarity index 98% rename from src/rank/criterion/sort_by_attr.rs rename to meilidb-core/src/criterion/sort_by_attr.rs index 05033a1e1..8b7b23fa6 100644 --- a/src/rank/criterion/sort_by_attr.rs +++ b/meilidb-core/src/criterion/sort_by_attr.rs @@ -3,9 +3,9 @@ use std::error::Error; use std::fmt; use crate::database::schema::{Schema, SchemaAttr}; -use crate::rank::criterion::Criterion; +use crate::criterion::Criterion; use crate::database::RankedMap; -use crate::rank::RawDocument; +use crate::RawDocument; /// An helper struct that permit to sort documents by /// some of their stored attributes. diff --git a/src/rank/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs similarity index 97% rename from src/rank/criterion/sum_of_typos.rs rename to meilidb-core/src/criterion/sum_of_typos.rs index bbffec870..714766a20 100644 --- a/src/rank/criterion/sum_of_typos.rs +++ b/meilidb-core/src/criterion/sum_of_typos.rs @@ -2,8 +2,8 @@ use std::cmp::Ordering; use slice_group_by::GroupBy; -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; // This function is a wrong logarithmic 10 function. // It is safe to panic on input number higher than 3, diff --git a/src/rank/criterion/sum_of_words_attribute.rs b/meilidb-core/src/criterion/sum_of_words_attribute.rs similarity index 92% rename from src/rank/criterion/sum_of_words_attribute.rs rename to meilidb-core/src/criterion/sum_of_words_attribute.rs index 0a5303490..a46787797 100644 --- a/src/rank/criterion/sum_of_words_attribute.rs +++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use slice_group_by::GroupBy; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[inline] fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { diff --git a/src/rank/criterion/sum_of_words_position.rs b/meilidb-core/src/criterion/sum_of_words_position.rs similarity index 93% rename from src/rank/criterion/sum_of_words_position.rs rename to meilidb-core/src/criterion/sum_of_words_position.rs index 5938ce5ab..86f4e93fa 100644 --- a/src/rank/criterion/sum_of_words_position.rs +++ b/meilidb-core/src/criterion/sum_of_words_position.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use slice_group_by::GroupBy; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[inline] fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { diff --git a/src/rank/criterion/words_proximity.rs b/meilidb-core/src/criterion/words_proximity.rs similarity index 98% rename from src/rank/criterion/words_proximity.rs rename to meilidb-core/src/criterion/words_proximity.rs index dbf26e21a..fc6c8bb31 100644 --- a/src/rank/criterion/words_proximity.rs +++ b/meilidb-core/src/criterion/words_proximity.rs @@ -1,9 +1,7 @@ use std::cmp::{self, Ordering}; - use slice_group_by::GroupBy; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; const MAX_DISTANCE: u16 = 8; diff --git a/src/data/doc_ids.rs b/meilidb-core/src/data/doc_ids.rs similarity index 100% rename from src/data/doc_ids.rs rename to meilidb-core/src/data/doc_ids.rs diff --git a/src/data/doc_indexes.rs b/meilidb-core/src/data/doc_indexes.rs similarity index 100% rename from src/data/doc_indexes.rs rename to meilidb-core/src/data/doc_indexes.rs diff --git a/src/data/mod.rs b/meilidb-core/src/data/mod.rs similarity index 100% rename from src/data/mod.rs rename to meilidb-core/src/data/mod.rs diff --git a/src/data/shared_data.rs b/meilidb-core/src/data/shared_data.rs similarity index 100% rename from src/data/shared_data.rs rename to meilidb-core/src/data/shared_data.rs diff --git a/src/rank/distinct_map.rs b/meilidb-core/src/distinct_map.rs similarity index 100% rename from src/rank/distinct_map.rs rename to meilidb-core/src/distinct_map.rs diff --git a/src/database/index.rs b/meilidb-core/src/index.rs similarity index 100% rename from src/database/index.rs rename to meilidb-core/src/index.rs diff --git a/src/rank/mod.rs b/meilidb-core/src/lib.rs similarity index 62% rename from src/rank/mod.rs rename to meilidb-core/src/lib.rs index f5b07d27d..7266aa87d 100644 --- a/src/rank/mod.rs +++ b/meilidb-core/src/lib.rs @@ -1,16 +1,118 @@ pub mod criterion; +pub mod data; +mod index; +mod automaton; mod query_builder; mod distinct_map; +pub mod shared_data_cursor; +pub mod write_to_bytes; + use std::sync::Arc; +use serde_derive::{Serialize, Deserialize}; use slice_group_by::GroupBy; use rayon::slice::ParallelSliceMut; -use crate::{Match, DocumentId}; - +pub use self::index::{Index, IndexBuilder}; pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; +/// Represent an internally generated document unique identifier. +/// +/// It is used to inform the database the document you want to deserialize. +/// Helpful for custom ranking. +#[derive(Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct DocumentId(pub u64); + +/// This structure represent the position of a word +/// in a document and its attributes. +/// +/// This is stored in the map, generated at index time, +/// extracted and interpreted at search time. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(C)] +pub struct DocIndex { + /// The document identifier where the word was found. + pub document_id: DocumentId, + + /// The attribute in the document where the word was found + /// along with the index in it. + pub attribute: u16, + pub word_index: u16, + + /// The position in bytes where the word was found + /// along with the length of it. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. + pub char_index: u16, + pub char_length: u16, +} + +/// This structure represent a matching word with informations +/// on the location of the word in the document. +/// +/// The order of the field is important because it defines +/// the way these structures are ordered between themselves. +/// +/// The word in itself is not important. +// TODO do data oriented programming ? very arrays ? +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Match { + /// The word index in the query sentence. + /// Same as the `attribute_index` but for the query words. + /// + /// Used to retrieve the automaton that match this word. + pub query_index: u32, + + /// The distance the word has with the query word + /// (i.e. the Levenshtein distance). + pub distance: u8, + + /// The attribute in the document where the word was found + /// along with the index in it. + pub attribute: u16, + pub word_index: u16, + + /// Whether the word that match is an exact match or a prefix. + pub is_exact: bool, + + /// The position in bytes where the word was found + /// along with the length of it. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. + pub char_index: u16, + pub char_length: u16, +} + +impl Match { + pub fn zero() -> Self { + Match { + query_index: 0, + distance: 0, + attribute: 0, + word_index: 0, + is_exact: false, + char_index: 0, + char_length: 0, + } + } + + pub fn max() -> Self { + Match { + query_index: u32::max_value(), + distance: u8::max_value(), + attribute: u16::max_value(), + word_index: u16::max_value(), + is_exact: true, + char_index: u16::max_value(), + char_length: u16::max_value(), + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, @@ -181,3 +283,15 @@ impl Matches { } } } + + +#[cfg(test)] +mod tests { + use super::*; + use std::mem; + + #[test] + fn docindex_mem_size() { + assert_eq!(mem::size_of::(), 24); + } +} diff --git a/src/rank/query_builder.rs b/meilidb-core/src/query_builder.rs similarity index 94% rename from src/rank/query_builder.rs rename to meilidb-core/src/query_builder.rs index 6b145b493..f462a52e6 100644 --- a/src/rank/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -11,11 +11,23 @@ use fst::Streamer; use log::info; use crate::automaton::{self, DfaExt, AutomatonExt}; -use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; -use crate::rank::criterion::Criteria; -use crate::database::Index; -use crate::rank::{raw_documents_from_matches, RawDocument, Document}; -use crate::{is_cjk, Match, DocumentId}; +use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; +use crate::criterion::Criteria; +use crate::{raw_documents_from_matches, RawDocument, Document}; +use crate::{Index, Match, DocumentId}; + +// query splitting must move out of this crate +pub fn is_cjk(c: char) -> bool { + (c >= '\u{2e80}' && c <= '\u{2eff}') || + (c >= '\u{2f00}' && c <= '\u{2fdf}') || + (c >= '\u{3040}' && c <= '\u{309f}') || + (c >= '\u{30a0}' && c <= '\u{30ff}') || + (c >= '\u{3100}' && c <= '\u{312f}') || + (c >= '\u{3200}' && c <= '\u{32ff}') || + (c >= '\u{3400}' && c <= '\u{4dbf}') || + (c >= '\u{4e00}' && c <= '\u{9fff}') || + (c >= '\u{f900}' && c <= '\u{faff}') +} #[derive(Debug, PartialEq, Eq)] enum CharCategory { diff --git a/src/shared_data_cursor.rs b/meilidb-core/src/shared_data_cursor.rs similarity index 100% rename from src/shared_data_cursor.rs rename to meilidb-core/src/shared_data_cursor.rs diff --git a/src/write_to_bytes.rs b/meilidb-core/src/write_to_bytes.rs similarity index 100% rename from src/write_to_bytes.rs rename to meilidb-core/src/write_to_bytes.rs diff --git a/meilidb/Cargo.lock b/meilidb/Cargo.lock new file mode 100644 index 000000000..1a32c8b9e --- /dev/null +++ b/meilidb/Cargo.lock @@ -0,0 +1,1072 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "aho-corasick" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "arc-swap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "arrayvec" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "atty" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "autocfg" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bincode" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bitflags" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "build_const" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "byteorder" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bzip2-sys" +version = "0.1.7" +source = "git+https://github.com/alexcrichton/bzip2-rs.git#18fd3e18bc1763219a7496e466a16bd213448fec" +dependencies = [ + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cc" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cfg-if" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "clap" +version = "2.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cmake" +version = "0.1.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crc" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-deque" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "csv" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "csv-core" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "either" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "env_logger" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "humantime 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fs_extra" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "fst" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "generic-array" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "typenum 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "glob" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "hashbrown" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "humantime" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "indexmap" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "itoa" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "jemalloc-sys" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", + "fs_extra 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "jemallocator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "jemalloc-sys 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lazy_static" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "levenshtein_automata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fst 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libc" +version = "0.2.49" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "librocksdb_sys" +version = "0.1.0" +source = "git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201#306e2010429873a1d1d979b70f0d30e437dddc6c" +dependencies = [ + "bzip2-sys 0.1.7 (git+https://github.com/alexcrichton/bzip2-rs.git)", + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", + "cmake 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "libz-sys 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", + "lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)", + "snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)", + "zstd-sys 1.4.9+zstd.1.3.8 (git+https://github.com/gyscos/zstd-rs.git)", +] + +[[package]] +name = "libz-sys" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_test 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lockfree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "owned-alloc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "log" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lz4-sys" +version = "1.8.0" +source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689" +dependencies = [ + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "meilidb" +version = "0.3.1" +dependencies = [ + "arc-swap 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "bincode 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "env_logger 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "hashbrown 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", + "jemallocator 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "linked-hash-map 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "lockfree 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quickcheck 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)", + "sdset 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)", + "size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "slice-group-by 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", + "structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", + "tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "toml 0.5.0 (git+https://github.com/Kerollmops/toml-rs.git?rev=0372ba6)", + "unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memchr" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memmap" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "memoffset" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "nodrop" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "num" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-complex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", + "num-iter 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", + "num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-complex" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-integer" +version = "0.1.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-iter" +version = "0.1.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-rational" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-traits" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "num_cpus" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "owned-alloc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "pkg-config" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quick-error" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "quickcheck" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "env_logger 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_jitter 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_os 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_chacha" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_core" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rand_hc" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_isaac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_jitter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_os" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_pcg" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand_xorshift" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rayon" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rayon-core" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "redox_syscall" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "remove_dir_all" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rocksdb" +version = "0.3.0" +source = "git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201#306e2010429873a1d1d979b70f0d30e437dddc6c" +dependencies = [ + "crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)", +] + +[[package]] +name = "ryu" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "scopeguard" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "sdset" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.88" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_test" +version = "1.0.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "size_format" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "generic-array 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", + "num 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "slice-group-by" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "snappy-sys" +version = "0.1.0" +source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71" +dependencies = [ + "cmake 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "strsim" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "structopt" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", + "structopt-derive 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "structopt-derive" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "syn" +version = "0.15.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tempfile" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", + "remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "termcolor" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "termion" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", + "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "textwrap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "toml" +version = "0.5.0" +source = "git+https://github.com/Kerollmops/toml-rs.git?rev=0372ba6#0372ba6925aa2c6db4d27022562064e25cdc5312" +dependencies = [ + "linked-hash-map 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "typenum" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "ucd-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-segmentation" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-width" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unidecode" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "utf8-ranges" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vcpkg" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "wincolor" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "zstd-sys" +version = "1.4.9+zstd.1.3.8" +source = "git+https://github.com/gyscos/zstd-rs.git#d51f87c668932670b9aced48d1b750506c211f11" +dependencies = [ + "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", + "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[metadata] +"checksum aho-corasick 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +"checksum arc-swap 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1025aeae2b664ca0ea726a89d574fe8f4e77dd712d443236ad1de00379450cf6" +"checksum arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "92c7fb76bc8826a8b33b4ee5bb07a247a81e76764ab4d55e8f73e3a4d8808c71" +"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" +"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799" +"checksum bincode 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3efe0b4c8eaeed8600549c29f538a6a11bf422858d0ed435b1d70ec4ab101190" +"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" +"checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39" +"checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb" +"checksum bzip2-sys 0.1.7 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "" +"checksum cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)" = "4390a3b5f4f6bce9c1d0c00128379df433e53777fdd30e92f16a529332baec4e" +"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" +"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" +"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum cmake 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)" = "6ec65ee4f9c9d16f335091d23693457ed4928657ba4982289d7fafee03bc614a" +"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" +"checksum crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f739f8c5363aca78cfb059edf753d8f0d36908c348f3d8d1503f03d8b75d9cf3" +"checksum crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "927121f5407de9956180ff5e936fe3cf4324279280001cd56b669d28ee7e9150" +"checksum crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2760899e32a1d58d5abb31129f8fae5de75220bc2176e77ff7c627ae45c918d9" +"checksum csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "9fd1c44c58078cfbeaf11fbb3eac9ae5534c23004ed770cc4bfb48e658ae4f04" +"checksum csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fa5cdef62f37e6ffe7d1f07a381bc0db32b7a3ff1cac0de56cb0d81e71f53d65" +"checksum either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c67353c641dc847124ea1902d69bd753dee9bb3beff9aa3662ecf86c971d1fac" +"checksum env_logger 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "afb070faf94c85d17d50ca44f6ad076bce18ae92f0037d350947240a36e9d42e" +"checksum fs_extra 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674" +"checksum fst 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "db72126ca7dff566cdbbdd54af44668c544897d9d3862b198141f176f1238bdf" +"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +"checksum generic-array 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3c0f28c2f5bfb5960175af447a2da7c18900693738343dc896ffbcabd9839592" +"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" +"checksum hashbrown 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "3bae29b6653b3412c2e71e9d486db9f9df5d701941d86683005efb9f2d28e3da" +"checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +"checksum humantime 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3ca7e5f2e110db35f93b837c81797f3714500b81d517bf20c431b16d3ca4f114" +"checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" +"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" +"checksum jemalloc-sys 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "bfc62c8e50e381768ce8ee0428ee53741929f7ebd73e4d83f669bcf7693e00ae" +"checksum jemallocator 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9f0cd42ac65f758063fea55126b0148b1ce0a6354ff78e07a4d6806bc65c4ab3" +"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" +"checksum levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73a004f877f468548d8d0ac4977456a249d8fabbdb8416c36db163dfc8f2e8ca" +"checksum libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)" = "413f3dfc802c5dc91dc570b05125b6cda9855edfaa9825c9849807876376e70e" +"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)" = "" +"checksum libz-sys 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "2eb5e43362e38e2bca2fd5f5134c4d4564a23a5c28e9b95411652021a8675ebe" +"checksum linked-hash-map 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "70fb39025bc7cdd76305867c4eccf2f2dcf6e9a57f5b21a93e1c2d86cd03ec9e" +"checksum lockfree 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "74ee94b5ad113c7cb98c5a040f783d0952ee4fe100993881d1673c2cb002dd23" +"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" +"checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "" +"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" +"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" +"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" +"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" +"checksum num 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cf4825417e1e1406b3782a8ce92f4d53f26ec055e3622e1881ca8e9f5f9e08db" +"checksum num-complex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "107b9be86cd2481930688277b675b0114578227f034674726605b8a482d8baf8" +"checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" +"checksum num-iter 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "af3fdbbc3291a5464dc57b03860ec37ca6bf915ed6ee385e7c6c052c422b2124" +"checksum num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e96f040177bb3da242b5b1ecf3f54b5d5af3efbbfb18608977a5d2767b22f10" +"checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" +"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba" +"checksum owned-alloc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "30fceb411f9a12ff9222c5f824026be368ff15dc2f13468d850c7d3f502205d6" +"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" +"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915" +"checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" +"checksum quickcheck 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3568ae5409428feef71bf062778bf5acfadc3d496b7696afa829f9eef70e17dc" +"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1" +"checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" +"checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" +"checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" +"checksum rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d0e7a549d590831370895ab7ba4ea0c1b6b011d106b5ff2da6eee112615e6dc0" +"checksum rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" +"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" +"checksum rand_jitter 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b9ea758282efe12823e0d952ddb269d2e1897227e464919a554f2a03ef1b832" +"checksum rand_os 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b7c690732391ae0abafced5015ffb53656abfaec61b342290e5eb56b286a679d" +"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" +"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +"checksum rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "373814f27745b2686b350dd261bfd24576a6fb0e2c5919b3a2b6005f820b0473" +"checksum rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b055d1e92aba6877574d8fe604a63c8b5df60f60e5982bf7ccbb1338ea527356" +"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +"checksum redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)" = "423e376fffca3dfa06c9e9790a9ccd282fafb3cc6e6397d01dbf64f9bacc6b85" +"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" +"checksum regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "8c2f35eedad5295fdf00a63d7d4b238135723f92b434ec06774dad15c7ab0861" +"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" +"checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)" = "" +"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" +"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" +"checksum sdset 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "876890e4982cfbf82aa77cf73df0c31812a912fb89fd454e02ef21ba5d3cac3b" +"checksum serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)" = "9f301d728f2b94c9a7691c90f07b0b4e8a4517181d9461be94c04bddeb4bd850" +"checksum serde_derive 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)" = "beed18e6f5175aef3ba670e57c60ef3b1b74d250d962a26604bff4c80e970dd4" +"checksum serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "27dce848e7467aa0e2fcaf0a413641499c0b745452aaca1194d24dedde9e13c9" +"checksum serde_test 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)" = "edb44ae54ee0ddf787ad6a5f4769cd61967cafe8ed4ef1b5189c10af73f689e2" +"checksum size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6ed5f6ab2122c6dec69dca18c72fa4590a27e581ad20d44960fe74c032a0b23b" +"checksum slice-group-by 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "049599674ed27c9b78b93265482068999c0fc71116e186ea4a408e9fc47723b0" +"checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "" +"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" +"checksum structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)" = "670ad348dc73012fcf78c71f06f9d942232cdd4c859d4b6975e27836c3efc0c3" +"checksum structopt-derive 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)" = "ef98172b1a00b0bec738508d3726540edcbd186d50dfd326f2b1febbb3559f04" +"checksum syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)" = "f92e629aa1d9c827b2bb8297046c1ccffc57c99b947a680d3ccff1f136a3bee9" +"checksum tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b86c784c88d98c801132806dadd3819ed29d8600836c4088e855cdf3e178ed8a" +"checksum termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4096add70612622289f2fdcdbd5086dc81c1e2675e6ae58d6c4f62a16c6d7f2f" +"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" +"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +"checksum toml 0.5.0 (git+https://github.com/Kerollmops/toml-rs.git?rev=0372ba6)" = "" +"checksum typenum 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169" +"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" +"checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" +"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" +"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" +"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" +"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" +"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" +"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" +"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9" +"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +"checksum wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "561ed901ae465d6185fa7864d63fbd5720d0ef718366c9a4dc83cf6170d7e9ba" +"checksum zstd-sys 1.4.9+zstd.1.3.8 (git+https://github.com/gyscos/zstd-rs.git)" = "" diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml new file mode 100644 index 000000000..f903a8ac0 --- /dev/null +++ b/meilidb/Cargo.toml @@ -0,0 +1,50 @@ +[package] +edition = "2018" +name = "meilidb" +version = "0.3.1" +authors = ["Kerollmops "] + +[dependencies] +arc-swap = "0.3.7" +bincode = "1.1.2" +byteorder = "1.3.1" +fst = "0.3.3" +hashbrown = { version = "0.1.8", features = ["serde"] } +linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } +lockfree = "0.5.1" +log = "0.4.6" +sdset = "0.3.1" +serde = "1.0.88" +serde_derive = "1.0.88" +serde_json = { version = "1.0.38", features = ["preserve_order"] } +size_format = "1.0.2" +slice-group-by = "0.2.4" +unidecode = "0.3.0" +meilidb-core = { path = "../meilidb-core", version = "0.1.0" } + +[dependencies.toml] +git = "https://github.com/Kerollmops/toml-rs.git" +features = ["preserve_order"] +rev = "0372ba6" + +[dependencies.rocksdb] +git = "https://github.com/pingcap/rust-rocksdb.git" +rev = "306e201" + +[features] +default = ["simd"] +i128 = ["bincode/i128"] +portable = ["rocksdb/portable"] +simd = ["rocksdb/sse"] +nightly = ["hashbrown/nightly", "slice-group-by/nightly"] + +[dev-dependencies] +csv = "1.0.5" +env_logger = "0.6.0" +jemallocator = "0.1.9" +quickcheck = "0.8.2" +rand = "0.6.5" +rand_xorshift = "0.1.1" +structopt = "0.2.14" +tempfile = "3.0.7" +termcolor = "1.0.4" diff --git a/src/common_words.rs b/meilidb/src/common_words.rs similarity index 100% rename from src/common_words.rs rename to meilidb/src/common_words.rs diff --git a/src/database/config.rs b/meilidb/src/database/config.rs similarity index 100% rename from src/database/config.rs rename to meilidb/src/database/config.rs diff --git a/src/database/document_key.rs b/meilidb/src/database/document_key.rs similarity index 99% rename from src/database/document_key.rs rename to meilidb/src/database/document_key.rs index 52fd428f8..d6b9865ef 100644 --- a/src/database/document_key.rs +++ b/meilidb/src/database/document_key.rs @@ -5,7 +5,7 @@ use std::fmt; use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt}; use crate::database::schema::SchemaAttr; -use crate::DocumentId; +use meilidb_core::DocumentId; const DOC_KEY_LEN: usize = 4 + size_of::(); const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::(); diff --git a/src/database/mod.rs b/meilidb/src/database/mod.rs similarity index 99% rename from src/database/mod.rs rename to meilidb/src/database/mod.rs index 70ca62d92..727a30bac 100644 --- a/src/database/mod.rs +++ b/meilidb/src/database/mod.rs @@ -17,9 +17,9 @@ use hashbrown::HashMap; use log::{info, error, warn}; use crate::database::schema::SchemaAttr; -use crate::shared_data_cursor::FromSharedDataCursor; -use crate::write_to_bytes::WriteToBytes; -use crate::DocumentId; +use meilidb_core::shared_data_cursor::FromSharedDataCursor; +use meilidb_core::write_to_bytes::WriteToBytes; +use meilidb_core::{Index, DocumentId}; use self::update::{ReadIndexEvent, ReadRankedMapEvent}; @@ -29,7 +29,6 @@ pub use self::view::{DatabaseView, DocumentIter}; pub use self::update::Update; pub use self::serde::SerializerError; pub use self::schema::Schema; -pub use self::index::Index; pub use self::number::{Number, ParseNumberError}; pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; @@ -41,7 +40,6 @@ const CONFIG: &[u8] = b"config"; pub mod config; pub mod schema; -pub(crate) mod index; mod number; mod document_key; mod serde; diff --git a/src/database/number.rs b/meilidb/src/database/number.rs similarity index 100% rename from src/database/number.rs rename to meilidb/src/database/number.rs diff --git a/src/database/schema.rs b/meilidb/src/database/schema.rs similarity index 99% rename from src/database/schema.rs rename to meilidb/src/database/schema.rs index fc64ffccc..b4e0a070c 100644 --- a/src/database/schema.rs +++ b/meilidb/src/database/schema.rs @@ -10,7 +10,7 @@ use linked_hash_map::LinkedHashMap; use crate::database::serde::find_id::FindDocumentIdSerializer; use crate::database::serde::SerializerError; -use crate::DocumentId; +use meilidb_core::DocumentId; pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false }; pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false }; diff --git a/src/database/serde/deserializer.rs b/meilidb/src/database/serde/deserializer.rs similarity index 99% rename from src/database/serde/deserializer.rs rename to meilidb/src/database/serde/deserializer.rs index 26d74984d..92374ab48 100644 --- a/src/database/serde/deserializer.rs +++ b/meilidb/src/database/serde/deserializer.rs @@ -10,7 +10,7 @@ use serde::de::{self, Visitor, IntoDeserializer}; use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; use crate::database::schema::Schema; -use crate::DocumentId; +use meilidb_core::DocumentId; pub struct Deserializer<'a, D> where D: Deref diff --git a/src/database/serde/find_id.rs b/meilidb/src/database/serde/find_id.rs similarity index 99% rename from src/database/serde/find_id.rs rename to meilidb/src/database/serde/find_id.rs index 98e2e8036..3c44b5e35 100644 --- a/src/database/serde/find_id.rs +++ b/meilidb/src/database/serde/find_id.rs @@ -3,7 +3,7 @@ use serde::ser; use crate::database::serde::key_to_string::KeyToStringSerializer; use crate::database::serde::{SerializerError, calculate_hash}; -use crate::DocumentId; +use meilidb_core::DocumentId; pub struct FindDocumentIdSerializer<'a> { pub id_attribute_name: &'a str, diff --git a/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs similarity index 99% rename from src/database/serde/indexer_serializer.rs rename to meilidb/src/database/serde/indexer_serializer.rs index c25ffe98c..2734fb3be 100644 --- a/src/database/serde/indexer_serializer.rs +++ b/meilidb/src/database/serde/indexer_serializer.rs @@ -2,13 +2,14 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; +use meilidb_core::{DocumentId, DocIndex}; use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::Token; -use crate::{is_cjk, DocumentId, DocIndex}; +use crate::is_cjk; pub struct IndexerSerializer<'a, 'b, B> { pub tokenizer_builder: &'a B, diff --git a/src/database/serde/key_to_string.rs b/meilidb/src/database/serde/key_to_string.rs similarity index 100% rename from src/database/serde/key_to_string.rs rename to meilidb/src/database/serde/key_to_string.rs diff --git a/src/database/serde/mod.rs b/meilidb/src/database/serde/mod.rs similarity index 100% rename from src/database/serde/mod.rs rename to meilidb/src/database/serde/mod.rs diff --git a/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs similarity index 99% rename from src/database/serde/serializer.rs rename to meilidb/src/database/serde/serializer.rs index 2f41bb82c..7e38f938e 100644 --- a/src/database/serde/serializer.rs +++ b/meilidb/src/database/serde/serializer.rs @@ -10,7 +10,7 @@ use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; use crate::tokenizer::TokenizerBuilder; use crate::database::schema::Schema; -use crate::DocumentId; +use meilidb_core::DocumentId; pub struct Serializer<'a, 'b, B> { pub schema: &'a Schema, diff --git a/src/database/serde/value_to_number.rs b/meilidb/src/database/serde/value_to_number.rs similarity index 100% rename from src/database/serde/value_to_number.rs rename to meilidb/src/database/serde/value_to_number.rs diff --git a/src/database/update/index_event.rs b/meilidb/src/database/update/index_event.rs similarity index 90% rename from src/database/update/index_event.rs rename to meilidb/src/database/update/index_event.rs index cd006aa3c..20dbcbf46 100644 --- a/src/database/update/index_event.rs +++ b/meilidb/src/database/update/index_event.rs @@ -1,11 +1,11 @@ use std::error::Error; use byteorder::{ReadBytesExt, WriteBytesExt}; +use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; +use meilidb_core::write_to_bytes::WriteToBytes; +use meilidb_core::data::DocIds; -use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use crate::write_to_bytes::WriteToBytes; use crate::database::Index; -use crate::data::DocIds; pub enum WriteIndexEvent<'a> { RemovedDocuments(&'a DocIds), diff --git a/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs similarity index 98% rename from src/database/update/mod.rs rename to meilidb/src/database/update/mod.rs index 548fb8bc2..eaae462b2 100644 --- a/src/database/update/mod.rs +++ b/meilidb/src/database/update/mod.rs @@ -5,19 +5,18 @@ use rocksdb::rocksdb::{Writable, WriteBatch}; use hashbrown::hash_map::HashMap; use sdset::{Set, SetBuf}; use serde::Serialize; +use meilidb_core::write_to_bytes::WriteToBytes; +use meilidb_core::data::DocIds; +use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; use crate::database::serde::serializer::Serializer; use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; use crate::database::schema::Schema; -use crate::database::index::IndexBuilder; use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; use crate::database::{RankedMap, Number}; use crate::tokenizer::TokenizerBuilder; -use crate::write_to_bytes::WriteToBytes; -use crate::data::DocIds; -use crate::{DocumentId, DocIndex}; pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; diff --git a/src/database/update/ranked_map_event.rs b/meilidb/src/database/update/ranked_map_event.rs similarity index 90% rename from src/database/update/ranked_map_event.rs rename to meilidb/src/database/update/ranked_map_event.rs index 5a51f8799..428bc62cf 100644 --- a/src/database/update/ranked_map_event.rs +++ b/meilidb/src/database/update/ranked_map_event.rs @@ -1,11 +1,11 @@ use std::error::Error; use byteorder::{ReadBytesExt, WriteBytesExt}; +use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; +use meilidb_core::write_to_bytes::WriteToBytes; +use meilidb_core::data::DocIds; -use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use crate::write_to_bytes::WriteToBytes; use crate::database::RankedMap; -use crate::data::DocIds; pub enum WriteRankedMapEvent<'a> { RemovedDocuments(&'a DocIds), diff --git a/src/database/view.rs b/meilidb/src/database/view.rs similarity index 98% rename from src/database/view.rs rename to meilidb/src/database/view.rs index b1fbc0bdd..8eb21a4c8 100644 --- a/src/database/view.rs +++ b/meilidb/src/database/view.rs @@ -6,16 +6,15 @@ use std::{fmt, marker}; use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; use serde::de::DeserializeOwned; +use meilidb_core::{Index, QueryBuilder, FilterFunc}; +use meilidb_core::DocumentId; use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; use crate::database::serde::deserializer::Deserializer; use crate::database::{DocumentKey, DocumentKeyAttr}; -use crate::rank::{QueryBuilder, FilterFunc}; use crate::database::schema::Schema; -use crate::database::index::Index; use crate::database::RankedMap; use crate::database::Config; -use crate::DocumentId; pub struct DatabaseView where D: Deref diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs new file mode 100644 index 000000000..ff4df44ea --- /dev/null +++ b/meilidb/src/lib.rs @@ -0,0 +1,22 @@ +#![cfg_attr(feature = "nightly", feature(test))] + +pub mod database; +pub mod tokenizer; +mod common_words; + +pub use rocksdb; + +pub use self::tokenizer::Tokenizer; +pub use self::common_words::CommonWords; + +pub fn is_cjk(c: char) -> bool { + (c >= '\u{2e80}' && c <= '\u{2eff}') || + (c >= '\u{2f00}' && c <= '\u{2fdf}') || + (c >= '\u{3040}' && c <= '\u{309f}') || + (c >= '\u{30a0}' && c <= '\u{30ff}') || + (c >= '\u{3100}' && c <= '\u{312f}') || + (c >= '\u{3200}' && c <= '\u{32ff}') || + (c >= '\u{3400}' && c <= '\u{4dbf}') || + (c >= '\u{4e00}' && c <= '\u{9fff}') || + (c >= '\u{f900}' && c <= '\u{faff}') +} diff --git a/src/tokenizer/mod.rs b/meilidb/src/tokenizer/mod.rs similarity index 100% rename from src/tokenizer/mod.rs rename to meilidb/src/tokenizer/mod.rs diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 964de8f75..000000000 --- a/src/lib.rs +++ /dev/null @@ -1,136 +0,0 @@ -#![cfg_attr(feature = "nightly", feature(test))] - -pub mod automaton; -pub mod database; -pub mod data; -pub mod rank; -pub mod tokenizer; -mod common_words; -mod shared_data_cursor; -mod write_to_bytes; - -use serde_derive::{Serialize, Deserialize}; - -pub use rocksdb; - -pub use self::tokenizer::Tokenizer; -pub use self::common_words::CommonWords; - -pub fn is_cjk(c: char) -> bool { - (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') -} - -/// Represent an internally generated document unique identifier. -/// -/// It is used to inform the database the document you want to deserialize. -/// Helpful for custom ranking. -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct DocumentId(u64); - -/// This structure represent the position of a word -/// in a document and its attributes. -/// -/// This is stored in the map, generated at index time, -/// extracted and interpreted at search time. -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[repr(C)] -pub struct DocIndex { - /// The document identifier where the word was found. - pub document_id: DocumentId, - - /// The attribute in the document where the word was found - /// along with the index in it. - pub attribute: u16, - pub word_index: u16, - - /// The position in bytes where the word was found - /// along with the length of it. - /// - /// It informs on the original word area in the text indexed - /// without needing to run the tokenizer again. - pub char_index: u16, - pub char_length: u16, -} - -/// This structure represent a matching word with informations -/// on the location of the word in the document. -/// -/// The order of the field is important because it defines -/// the way these structures are ordered between themselves. -/// -/// The word in itself is not important. -// TODO do data oriented programming ? very arrays ? -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Match { - /// The word index in the query sentence. - /// Same as the `attribute_index` but for the query words. - /// - /// Used to retrieve the automaton that match this word. - pub query_index: u32, - - /// The distance the word has with the query word - /// (i.e. the Levenshtein distance). - pub distance: u8, - - /// The attribute in the document where the word was found - /// along with the index in it. - pub attribute: u16, - pub word_index: u16, - - /// Whether the word that match is an exact match or a prefix. - pub is_exact: bool, - - /// The position in bytes where the word was found - /// along with the length of it. - /// - /// It informs on the original word area in the text indexed - /// without needing to run the tokenizer again. - pub char_index: u16, - pub char_length: u16, -} - -impl Match { - pub fn zero() -> Self { - Match { - query_index: 0, - distance: 0, - attribute: 0, - word_index: 0, - is_exact: false, - char_index: 0, - char_length: 0, - } - } - - pub fn max() -> Self { - Match { - query_index: u32::max_value(), - distance: u8::max_value(), - attribute: u16::max_value(), - word_index: u16::max_value(), - is_exact: true, - char_index: u16::max_value(), - char_length: u16::max_value(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::mem; - - #[test] - fn docindex_mem_size() { - assert_eq!(mem::size_of::(), 16); - } -} From d0786b415609f5e65d73aa6a90cff4a5aba571e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 24 Feb 2019 19:53:52 +0100 Subject: [PATCH 02/44] chore: Move the SortByAttr into meilidb --- meilidb-core/src/criterion/mod.rs | 2 -- meilidb/src/lib.rs | 2 ++ {meilidb-core/src/criterion => meilidb/src}/sort_by_attr.rs | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) rename {meilidb-core/src/criterion => meilidb/src}/sort_by_attr.rs (97%) diff --git a/meilidb-core/src/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs index 3e2fd5028..2ad3a183c 100644 --- a/meilidb-core/src/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -4,7 +4,6 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; -// mod sort_by_attr; mod document_id; use std::cmp::Ordering; @@ -17,7 +16,6 @@ pub use self::{ sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, exact::Exact, - // sort_by_attr::SortByAttr, document_id::DocumentId, }; diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs index ff4df44ea..73de9ff4d 100644 --- a/meilidb/src/lib.rs +++ b/meilidb/src/lib.rs @@ -3,9 +3,11 @@ pub mod database; pub mod tokenizer; mod common_words; +mod sort_by_attr; pub use rocksdb; +pub use self::sort_by_attr::SortByAttr; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; diff --git a/meilidb-core/src/criterion/sort_by_attr.rs b/meilidb/src/sort_by_attr.rs similarity index 97% rename from meilidb-core/src/criterion/sort_by_attr.rs rename to meilidb/src/sort_by_attr.rs index 8b7b23fa6..24364aaf4 100644 --- a/meilidb-core/src/criterion/sort_by_attr.rs +++ b/meilidb/src/sort_by_attr.rs @@ -2,10 +2,11 @@ use std::cmp::Ordering; use std::error::Error; use std::fmt; +use meilidb_core::criterion::Criterion; +use meilidb_core::RawDocument; + use crate::database::schema::{Schema, SchemaAttr}; -use crate::criterion::Criterion; use crate::database::RankedMap; -use crate::RawDocument; /// An helper struct that permit to sort documents by /// some of their stored attributes. From 3bcb1dc802a0f2b7134c13fa73f951425cf6356e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 24 Feb 2019 19:55:29 +0100 Subject: [PATCH 03/44] chore: Allow the activation of the meilidb-core i128 feature --- meilidb/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index f903a8ac0..01538e878 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -33,7 +33,7 @@ rev = "306e201" [features] default = ["simd"] -i128 = ["bincode/i128"] +i128 = ["bincode/i128", "meilidb-core/i128"] portable = ["rocksdb/portable"] simd = ["rocksdb/sse"] nightly = ["hashbrown/nightly", "slice-group-by/nightly"] From bc227bef21218b8219d2d62db4b9f9ad52244018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 24 Feb 2019 19:57:52 +0100 Subject: [PATCH 04/44] chore: Add a nightly feature to meilidb-core --- meilidb-core/Cargo.toml | 1 + meilidb/Cargo.toml | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 5523a1331..2c5ec0680 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -19,3 +19,4 @@ slice-group-by = "0.2.4" [features] i128 = ["byteorder/i128"] +nightly = ["hashbrown/nightly", "slice-group-by/nightly"] diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 01538e878..1600feb04 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -36,7 +36,11 @@ default = ["simd"] i128 = ["bincode/i128", "meilidb-core/i128"] portable = ["rocksdb/portable"] simd = ["rocksdb/sse"] -nightly = ["hashbrown/nightly", "slice-group-by/nightly"] +nightly = [ + "hashbrown/nightly", + "slice-group-by/nightly", + "meilidb-core/nightly" +] [dev-dependencies] csv = "1.0.5" From d8cbb03c428be5588faedd9e84b4951d3ebb20ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 24 Feb 2019 20:02:25 +0100 Subject: [PATCH 05/44] chore: Update the .gitignore file --- .gitignore | 3 +- meilidb/Cargo.lock | 1072 -------------------------------------------- 2 files changed, 2 insertions(+), 1073 deletions(-) delete mode 100644 meilidb/Cargo.lock diff --git a/.gitignore b/.gitignore index 5768350a8..c38aa51d3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ -/rocksdb /target /Cargo.lock +meilidb/Cargo.lock +meilidb-core/Cargo.lock **/*.rs.bk **/*.csv **/*.json_lines diff --git a/meilidb/Cargo.lock b/meilidb/Cargo.lock deleted file mode 100644 index 1a32c8b9e..000000000 --- a/meilidb/Cargo.lock +++ /dev/null @@ -1,1072 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -[[package]] -name = "aho-corasick" -version = "0.6.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "ansi_term" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "arc-swap" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "arrayvec" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "atty" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "autocfg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "bincode" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "bitflags" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "build_const" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "byteorder" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "bzip2-sys" -version = "0.1.7" -source = "git+https://github.com/alexcrichton/bzip2-rs.git#18fd3e18bc1763219a7496e466a16bd213448fec" -dependencies = [ - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "cc" -version = "1.0.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "cfg-if" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "clap" -version = "2.32.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", - "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", - "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", - "textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "cmake" -version = "0.1.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "crc" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "crossbeam-deque" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", - "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "crossbeam-utils" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "csv" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "csv-core" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "either" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "env_logger" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", - "humantime 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "fs_extra" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "fst" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "generic-array" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "typenum 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "glob" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "hashbrown" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "heck" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "humantime" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "indexmap" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "itoa" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "jemalloc-sys" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", - "fs_extra 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "jemallocator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "jemalloc-sys 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "lazy_static" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "levenshtein_automata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "fst 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "libc" -version = "0.2.49" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "librocksdb_sys" -version = "0.1.0" -source = "git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201#306e2010429873a1d1d979b70f0d30e437dddc6c" -dependencies = [ - "bzip2-sys 0.1.7 (git+https://github.com/alexcrichton/bzip2-rs.git)", - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", - "cmake 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "libz-sys 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)", - "lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)", - "snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)", - "zstd-sys 1.4.9+zstd.1.3.8 (git+https://github.com/gyscos/zstd-rs.git)", -] - -[[package]] -name = "libz-sys" -version = "1.0.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", - "vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "linked-hash-map" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_test 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "lockfree" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "owned-alloc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "log" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "lz4-sys" -version = "1.8.0" -source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689" -dependencies = [ - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "meilidb" -version = "0.3.1" -dependencies = [ - "arc-swap 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", - "bincode 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", - "env_logger 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", - "hashbrown 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", - "jemallocator 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "linked-hash-map 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "lockfree 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "quickcheck 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)", - "sdset 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)", - "size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "slice-group-by 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", - "structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", - "tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)", - "termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "toml 0.5.0 (git+https://github.com/Kerollmops/toml-rs.git?rev=0372ba6)", - "unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "memchr" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "memmap" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "memoffset" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "nodrop" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "num" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "num-complex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", - "num-iter 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", - "num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "num-complex" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "num-integer" -version = "0.1.39" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "num-iter" -version = "0.1.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", - "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "num-rational" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)", - "num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "num-traits" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "num_cpus" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "owned-alloc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "pkg-config" -version = "0.3.14" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "proc-macro2" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "quick-error" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "quickcheck" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "env_logger 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "quote" -version = "0.6.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_jitter 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_os 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_core" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_jitter" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_os" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_pcg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rayon" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rayon-core" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "redox_syscall" -version = "0.1.51" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "redox_termios" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "regex" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "aho-corasick 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", - "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "regex-syntax" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "remove_dir_all" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rocksdb" -version = "0.3.0" -source = "git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201#306e2010429873a1d1d979b70f0d30e437dddc6c" -dependencies = [ - "crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)", -] - -[[package]] -name = "ryu" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "scopeguard" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "sdset" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "serde" -version = "1.0.88" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "serde_derive" -version = "1.0.88" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "serde_json" -version = "1.0.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", - "ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "serde_test" -version = "1.0.88" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "size_format" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "generic-array 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", - "num 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "slice-group-by" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "snappy-sys" -version = "0.1.0" -source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71" -dependencies = [ - "cmake 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "strsim" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "structopt" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", - "structopt-derive 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "structopt-derive" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "syn" -version = "0.15.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)", - "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "tempfile" -version = "3.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", - "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", - "remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "termcolor" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "termion" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", - "redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)", - "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "textwrap" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "thread_local" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "toml" -version = "0.5.0" -source = "git+https://github.com/Kerollmops/toml-rs.git?rev=0372ba6#0372ba6925aa2c6db4d27022562064e25cdc5312" -dependencies = [ - "linked-hash-map 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "typenum" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "ucd-util" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "unicode-segmentation" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "unicode-width" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "unicode-xid" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "unidecode" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "utf8-ranges" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "vcpkg" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "vec_map" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "winapi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "winapi-util" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "wincolor" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "zstd-sys" -version = "1.4.9+zstd.1.3.8" -source = "git+https://github.com/gyscos/zstd-rs.git#d51f87c668932670b9aced48d1b750506c211f11" -dependencies = [ - "cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)", - "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[metadata] -"checksum aho-corasick 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" -"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" -"checksum arc-swap 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1025aeae2b664ca0ea726a89d574fe8f4e77dd712d443236ad1de00379450cf6" -"checksum arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "92c7fb76bc8826a8b33b4ee5bb07a247a81e76764ab4d55e8f73e3a4d8808c71" -"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" -"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799" -"checksum bincode 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3efe0b4c8eaeed8600549c29f538a6a11bf422858d0ed435b1d70ec4ab101190" -"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12" -"checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39" -"checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb" -"checksum bzip2-sys 0.1.7 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "" -"checksum cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)" = "4390a3b5f4f6bce9c1d0c00128379df433e53777fdd30e92f16a529332baec4e" -"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" -"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" -"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -"checksum cmake 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)" = "6ec65ee4f9c9d16f335091d23693457ed4928657ba4982289d7fafee03bc614a" -"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" -"checksum crossbeam-deque 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f739f8c5363aca78cfb059edf753d8f0d36908c348f3d8d1503f03d8b75d9cf3" -"checksum crossbeam-epoch 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "927121f5407de9956180ff5e936fe3cf4324279280001cd56b669d28ee7e9150" -"checksum crossbeam-utils 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2760899e32a1d58d5abb31129f8fae5de75220bc2176e77ff7c627ae45c918d9" -"checksum csv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "9fd1c44c58078cfbeaf11fbb3eac9ae5534c23004ed770cc4bfb48e658ae4f04" -"checksum csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fa5cdef62f37e6ffe7d1f07a381bc0db32b7a3ff1cac0de56cb0d81e71f53d65" -"checksum either 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c67353c641dc847124ea1902d69bd753dee9bb3beff9aa3662ecf86c971d1fac" -"checksum env_logger 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "afb070faf94c85d17d50ca44f6ad076bce18ae92f0037d350947240a36e9d42e" -"checksum fs_extra 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674" -"checksum fst 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "db72126ca7dff566cdbbdd54af44668c544897d9d3862b198141f176f1238bdf" -"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" -"checksum generic-array 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3c0f28c2f5bfb5960175af447a2da7c18900693738343dc896ffbcabd9839592" -"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" -"checksum hashbrown 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "3bae29b6653b3412c2e71e9d486db9f9df5d701941d86683005efb9f2d28e3da" -"checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" -"checksum humantime 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3ca7e5f2e110db35f93b837c81797f3714500b81d517bf20c431b16d3ca4f114" -"checksum indexmap 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7e81a7c05f79578dbc15793d8b619db9ba32b4577003ef3af1a91c416798c58d" -"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b" -"checksum jemalloc-sys 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "bfc62c8e50e381768ce8ee0428ee53741929f7ebd73e4d83f669bcf7693e00ae" -"checksum jemallocator 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9f0cd42ac65f758063fea55126b0148b1ce0a6354ff78e07a4d6806bc65c4ab3" -"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" -"checksum levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "73a004f877f468548d8d0ac4977456a249d8fabbdb8416c36db163dfc8f2e8ca" -"checksum libc 0.2.49 (registry+https://github.com/rust-lang/crates.io-index)" = "413f3dfc802c5dc91dc570b05125b6cda9855edfaa9825c9849807876376e70e" -"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)" = "" -"checksum libz-sys 1.0.25 (registry+https://github.com/rust-lang/crates.io-index)" = "2eb5e43362e38e2bca2fd5f5134c4d4564a23a5c28e9b95411652021a8675ebe" -"checksum linked-hash-map 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "70fb39025bc7cdd76305867c4eccf2f2dcf6e9a57f5b21a93e1c2d86cd03ec9e" -"checksum lockfree 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "74ee94b5ad113c7cb98c5a040f783d0952ee4fe100993881d1673c2cb002dd23" -"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6" -"checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "" -"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" -"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" -"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" -"checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" -"checksum num 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cf4825417e1e1406b3782a8ce92f4d53f26ec055e3622e1881ca8e9f5f9e08db" -"checksum num-complex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "107b9be86cd2481930688277b675b0114578227f034674726605b8a482d8baf8" -"checksum num-integer 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "e83d528d2677f0518c570baf2b7abdcf0cd2d248860b68507bdcb3e91d4c0cea" -"checksum num-iter 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "af3fdbbc3291a5464dc57b03860ec37ca6bf915ed6ee385e7c6c052c422b2124" -"checksum num-rational 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4e96f040177bb3da242b5b1ecf3f54b5d5af3efbbfb18608977a5d2767b22f10" -"checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" -"checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba" -"checksum owned-alloc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "30fceb411f9a12ff9222c5f824026be368ff15dc2f13468d850c7d3f502205d6" -"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c" -"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915" -"checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" -"checksum quickcheck 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3568ae5409428feef71bf062778bf5acfadc3d496b7696afa829f9eef70e17dc" -"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1" -"checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -"checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -"checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -"checksum rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d0e7a549d590831370895ab7ba4ea0c1b6b011d106b5ff2da6eee112615e6dc0" -"checksum rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -"checksum rand_jitter 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b9ea758282efe12823e0d952ddb269d2e1897227e464919a554f2a03ef1b832" -"checksum rand_os 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b7c690732391ae0abafced5015ffb53656abfaec61b342290e5eb56b286a679d" -"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" -"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -"checksum rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "373814f27745b2686b350dd261bfd24576a6fb0e2c5919b3a2b6005f820b0473" -"checksum rayon-core 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b055d1e92aba6877574d8fe604a63c8b5df60f60e5982bf7ccbb1338ea527356" -"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -"checksum redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)" = "423e376fffca3dfa06c9e9790a9ccd282fafb3cc6e6397d01dbf64f9bacc6b85" -"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" -"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" -"checksum regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "8c2f35eedad5295fdf00a63d7d4b238135723f92b434ec06774dad15c7ab0861" -"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" -"checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git?rev=306e201)" = "" -"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7" -"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" -"checksum sdset 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "876890e4982cfbf82aa77cf73df0c31812a912fb89fd454e02ef21ba5d3cac3b" -"checksum serde 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)" = "9f301d728f2b94c9a7691c90f07b0b4e8a4517181d9461be94c04bddeb4bd850" -"checksum serde_derive 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)" = "beed18e6f5175aef3ba670e57c60ef3b1b74d250d962a26604bff4c80e970dd4" -"checksum serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "27dce848e7467aa0e2fcaf0a413641499c0b745452aaca1194d24dedde9e13c9" -"checksum serde_test 1.0.88 (registry+https://github.com/rust-lang/crates.io-index)" = "edb44ae54ee0ddf787ad6a5f4769cd61967cafe8ed4ef1b5189c10af73f689e2" -"checksum size_format 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6ed5f6ab2122c6dec69dca18c72fa4590a27e581ad20d44960fe74c032a0b23b" -"checksum slice-group-by 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "049599674ed27c9b78b93265482068999c0fc71116e186ea4a408e9fc47723b0" -"checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "" -"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" -"checksum structopt 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)" = "670ad348dc73012fcf78c71f06f9d942232cdd4c859d4b6975e27836c3efc0c3" -"checksum structopt-derive 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)" = "ef98172b1a00b0bec738508d3726540edcbd186d50dfd326f2b1febbb3559f04" -"checksum syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)" = "f92e629aa1d9c827b2bb8297046c1ccffc57c99b947a680d3ccff1f136a3bee9" -"checksum tempfile 3.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b86c784c88d98c801132806dadd3819ed29d8600836c4088e855cdf3e178ed8a" -"checksum termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4096add70612622289f2fdcdbd5086dc81c1e2675e6ae58d6c4f62a16c6d7f2f" -"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" -"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" -"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" -"checksum toml 0.5.0 (git+https://github.com/Kerollmops/toml-rs.git?rev=0372ba6)" = "" -"checksum typenum 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169" -"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" -"checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" -"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" -"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" -"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" -"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" -"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d" -"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" -"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" -"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9" -"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -"checksum wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "561ed901ae465d6185fa7864d63fbd5720d0ef718366c9a4dc83cf6170d7e9ba" -"checksum zstd-sys 1.4.9+zstd.1.3.8 (git+https://github.com/gyscos/zstd-rs.git)" = "" From 1897da53483d7b7b36367923e642972b4b8f562f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Feb 2019 18:24:46 +0100 Subject: [PATCH 06/44] feat: Move tokenizer things into the meilidb-tokenizer workspace --- Cargo.toml | 1 + meilidb-core/Cargo.toml | 1 + meilidb-tokenizer/Cargo.toml | 8 ++++++++ .../mod.rs => meilidb-tokenizer/src/lib.rs | 13 ++++++++++++- meilidb/Cargo.toml | 3 ++- meilidb/src/database/mod.rs | 2 +- meilidb/src/database/serde/indexer_serializer.rs | 4 +--- meilidb/src/database/serde/serializer.rs | 2 +- meilidb/src/database/update/mod.rs | 2 +- meilidb/src/lib.rs | 14 -------------- 10 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 meilidb-tokenizer/Cargo.toml rename meilidb/src/tokenizer/mod.rs => meilidb-tokenizer/src/lib.rs (95%) diff --git a/Cargo.toml b/Cargo.toml index df9c871ba..139e8b472 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,4 +2,5 @@ members = [ "meilidb", "meilidb-core", + "meilidb-tokenizer", ] diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 2c5ec0680..fbac7dbe2 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -11,6 +11,7 @@ hashbrown = "0.1.8" lazy_static = "1.2.0" levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } log = "0.4.6" +meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } rayon = "1.0.3" sdset = "0.3.1" serde = "1.0.88" diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml new file mode 100644 index 000000000..c2077533e --- /dev/null +++ b/meilidb-tokenizer/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "meilidb-tokenizer" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] + diff --git a/meilidb/src/tokenizer/mod.rs b/meilidb-tokenizer/src/lib.rs similarity index 95% rename from meilidb/src/tokenizer/mod.rs rename to meilidb-tokenizer/src/lib.rs index ed146c06f..7c4c8f915 100644 --- a/meilidb/src/tokenizer/mod.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,7 +1,18 @@ use std::mem; -use crate::is_cjk; use self::Separator::*; +pub fn is_cjk(c: char) -> bool { + (c >= '\u{2e80}' && c <= '\u{2eff}') || + (c >= '\u{2f00}' && c <= '\u{2fdf}') || + (c >= '\u{3040}' && c <= '\u{309f}') || + (c >= '\u{30a0}' && c <= '\u{30ff}') || + (c >= '\u{3100}' && c <= '\u{312f}') || + (c >= '\u{3200}' && c <= '\u{32ff}') || + (c >= '\u{3400}' && c <= '\u{4dbf}') || + (c >= '\u{4e00}' && c <= '\u{9fff}') || + (c >= '\u{f900}' && c <= '\u{faff}') +} + pub trait TokenizerBuilder { fn build<'a>(&self, text: &'a str) -> Box> + 'a>; } diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 1600feb04..8a042168c 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -13,6 +13,8 @@ hashbrown = { version = "0.1.8", features = ["serde"] } linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } lockfree = "0.5.1" log = "0.4.6" +meilidb-core = { path = "../meilidb-core", version = "0.1.0" } +meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } sdset = "0.3.1" serde = "1.0.88" serde_derive = "1.0.88" @@ -20,7 +22,6 @@ serde_json = { version = "1.0.38", features = ["preserve_order"] } size_format = "1.0.2" slice-group-by = "0.2.4" unidecode = "0.3.0" -meilidb-core = { path = "../meilidb-core", version = "0.1.0" } [dependencies.toml] git = "https://github.com/Kerollmops/toml-rs.git" diff --git a/meilidb/src/database/mod.rs b/meilidb/src/database/mod.rs index 727a30bac..08ca6cd7f 100644 --- a/meilidb/src/database/mod.rs +++ b/meilidb/src/database/mod.rs @@ -430,9 +430,9 @@ mod tests { use std::error::Error; use serde_derive::{Serialize, Deserialize}; + use meilidb_tokenizer::DefaultBuilder; use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; - use crate::tokenizer::DefaultBuilder; use super::*; diff --git a/meilidb/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs index 2734fb3be..f718111dc 100644 --- a/meilidb/src/database/serde/indexer_serializer.rs +++ b/meilidb/src/database/serde/indexer_serializer.rs @@ -3,13 +3,11 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; use meilidb_core::{DocumentId, DocIndex}; +use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk}; use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; -use crate::tokenizer::TokenizerBuilder; -use crate::tokenizer::Token; -use crate::is_cjk; pub struct IndexerSerializer<'a, 'b, B> { pub tokenizer_builder: &'a B, diff --git a/meilidb/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs index 7e38f938e..8b3a05b46 100644 --- a/meilidb/src/database/serde/serializer.rs +++ b/meilidb/src/database/serde/serializer.rs @@ -2,13 +2,13 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; +use meilidb_tokenizer::TokenizerBuilder; use crate::database::serde::indexer_serializer::IndexerSerializer; use crate::database::serde::key_to_string::KeyToStringSerializer; use crate::database::serde::value_to_number::ValueToNumberSerializer; use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; -use crate::tokenizer::TokenizerBuilder; use crate::database::schema::Schema; use meilidb_core::DocumentId; diff --git a/meilidb/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs index eaae462b2..720b7aaf3 100644 --- a/meilidb/src/database/update/mod.rs +++ b/meilidb/src/database/update/mod.rs @@ -8,6 +8,7 @@ use serde::Serialize; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::data::DocIds; use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; +use meilidb_tokenizer::TokenizerBuilder; use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; use crate::database::serde::serializer::Serializer; @@ -16,7 +17,6 @@ use crate::database::schema::SchemaAttr; use crate::database::schema::Schema; use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; use crate::database::{RankedMap, Number}; -use crate::tokenizer::TokenizerBuilder; pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs index 73de9ff4d..325df65eb 100644 --- a/meilidb/src/lib.rs +++ b/meilidb/src/lib.rs @@ -1,24 +1,10 @@ #![cfg_attr(feature = "nightly", feature(test))] pub mod database; -pub mod tokenizer; mod common_words; mod sort_by_attr; pub use rocksdb; pub use self::sort_by_attr::SortByAttr; -pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; - -pub fn is_cjk(c: char) -> bool { - (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') -} From 19e67dcf0b3c80ea156521813f562b2077ec6928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Feb 2019 18:34:51 +0100 Subject: [PATCH 07/44] feat: Move query splitting into the tokenizer workspace --- meilidb-core/src/query_builder.rs | 50 ++++--------------------------- meilidb-tokenizer/Cargo.toml | 2 +- meilidb-tokenizer/src/lib.rs | 28 +++++++++++++++++ 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index f462a52e6..6d76cfb48 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -5,7 +5,8 @@ use std::hash::Hash; use std::rc::Rc; use rayon::slice::ParallelSliceMut; -use slice_group_by::{GroupByMut, LinearStrGroupBy}; +use slice_group_by::GroupByMut; +use meilidb_tokenizer::{is_cjk, split_query_string}; use hashbrown::{HashMap, HashSet}; use fst::Streamer; use log::info; @@ -16,50 +17,11 @@ use crate::criterion::Criteria; use crate::{raw_documents_from_matches, RawDocument, Document}; use crate::{Index, Match, DocumentId}; -// query splitting must move out of this crate -pub fn is_cjk(c: char) -> bool { - (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') -} - -#[derive(Debug, PartialEq, Eq)] -enum CharCategory { - Space, - Cjk, - Other, -} - -fn classify_char(c: char) -> CharCategory { - if c.is_whitespace() { CharCategory::Space } - else if is_cjk(c) { CharCategory::Cjk } - else { CharCategory::Other } -} - -fn is_word(s: &&str) -> bool { - !s.chars().any(char::is_whitespace) -} - -fn same_group_category(a: char, b: char) -> bool { - let ca = classify_char(a); - let cb = classify_char(b); - if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } -} - -fn split_whitespace_automatons(query: &str) -> Vec { +fn generate_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let mut groups = LinearStrGroupBy::new(query, same_group_category) - .filter(is_word) - .map(str::to_lowercase) - .peekable(); - + let mut groups = split_query_string(query).map(str::to_lowercase).peekable(); let mut automatons = Vec::new(); + while let Some(word) = groups.next() { let has_following_word = groups.peek().is_some(); let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) { @@ -122,7 +84,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> } fn query_all(&self, query: &str) -> Vec { - let automatons = split_whitespace_automatons(query); + let automatons = generate_automatons(query); let mut stream = { let mut op_builder = fst::map::OpBuilder::new(); diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml index c2077533e..c8b643d09 100644 --- a/meilidb-tokenizer/Cargo.toml +++ b/meilidb-tokenizer/Cargo.toml @@ -5,4 +5,4 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] - +slice-group-by = "0.2.4" diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs index 7c4c8f915..8cdb32dc3 100644 --- a/meilidb-tokenizer/src/lib.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,4 +1,5 @@ use std::mem; +use slice_group_by::LinearStrGroupBy; use self::Separator::*; pub fn is_cjk(c: char) -> bool { @@ -13,6 +14,33 @@ pub fn is_cjk(c: char) -> bool { (c >= '\u{f900}' && c <= '\u{faff}') } +#[derive(Debug, PartialEq, Eq)] +enum CharCategory { + Space, + Cjk, + Other, +} + +fn classify_char(c: char) -> CharCategory { + if c.is_whitespace() { CharCategory::Space } + else if is_cjk(c) { CharCategory::Cjk } + else { CharCategory::Other } +} + +fn is_word(s: &&str) -> bool { + !s.chars().any(char::is_whitespace) +} + +fn same_group_category(a: char, b: char) -> bool { + let ca = classify_char(a); + let cb = classify_char(b); + if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } +} + +pub fn split_query_string(query: &str) -> impl Iterator { + LinearStrGroupBy::new(query, same_group_category).filter(is_word) +} + pub trait TokenizerBuilder { fn build<'a>(&self, text: &'a str) -> Box> + 'a>; } From 5d5bcf7011bc87e48f5eea802051b6c6842aec46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Feb 2019 18:39:58 +0100 Subject: [PATCH 08/44] feat: Remove the FilterFunc alias type --- meilidb-core/src/lib.rs | 2 +- meilidb-core/src/query_builder.rs | 6 ++---- meilidb/src/database/view.rs | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 7266aa87d..8b8606ddf 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -15,7 +15,7 @@ use slice_group_by::GroupBy; use rayon::slice::ParallelSliceMut; pub use self::index::{Index, IndexBuilder}; -pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; +pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder}; /// Represent an internally generated document unique identifier. /// diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 6d76cfb48..d5ec79a50 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -35,16 +35,14 @@ fn generate_automatons(query: &str) -> Vec { automatons } -pub type FilterFunc = fn(DocumentId) -> bool; - -pub struct QueryBuilder<'i, 'c, FI> { +pub struct QueryBuilder<'i, 'c, FI = fn(DocumentId) -> bool> { index: &'i Index, criteria: Criteria<'c>, searchable_attrs: Option>, filter: Option, } -impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> { +impl<'i, 'c> QueryBuilder<'i, 'c, fn(DocumentId) -> bool> { pub fn new(index: &'i Index) -> Self { QueryBuilder::with_criteria(index, Criteria::default()) } diff --git a/meilidb/src/database/view.rs b/meilidb/src/database/view.rs index 8eb21a4c8..fcbb3fea1 100644 --- a/meilidb/src/database/view.rs +++ b/meilidb/src/database/view.rs @@ -6,8 +6,7 @@ use std::{fmt, marker}; use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; use serde::de::DeserializeOwned; -use meilidb_core::{Index, QueryBuilder, FilterFunc}; -use meilidb_core::DocumentId; +use meilidb_core::{Index, QueryBuilder, DocumentId}; use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; use crate::database::serde::deserializer::Deserializer; @@ -84,7 +83,7 @@ where D: Deref Ok(()) } - pub fn query_builder(&self) -> QueryBuilder { + pub fn query_builder(&self) -> QueryBuilder { QueryBuilder::new(self.index()) } From a745819ddf0a9dda0da557ca980f439cdab1dcb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Feb 2019 12:16:10 +0100 Subject: [PATCH 09/44] feat: Simplify the Tokenizer to use the LinearStrGroupBy type --- meilidb-tokenizer/Cargo.toml | 2 +- meilidb-tokenizer/src/lib.rs | 259 +++++++++++++---------------------- 2 files changed, 96 insertions(+), 165 deletions(-) diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml index c8b643d09..32c9429b7 100644 --- a/meilidb-tokenizer/Cargo.toml +++ b/meilidb-tokenizer/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "meilidb-tokenizer" version = "0.1.0" -authors = ["Clément Renault "] +authors = ["Kerollmops "] edition = "2018" [dependencies] diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs index 8cdb32dc3..48bce151b 100644 --- a/meilidb-tokenizer/src/lib.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,6 +1,5 @@ -use std::mem; -use slice_group_by::LinearStrGroupBy; -use self::Separator::*; +use slice_group_by::StrGroupBy; +use self::SeparatorCategory::*; pub fn is_cjk(c: char) -> bool { (c >= '\u{2e80}' && c <= '\u{2eff}') || @@ -14,208 +13,140 @@ pub fn is_cjk(c: char) -> bool { (c >= '\u{f900}' && c <= '\u{faff}') } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +enum SeparatorCategory { + Soft, + Hard, +} + +impl SeparatorCategory { + fn merge(self, other: SeparatorCategory) -> SeparatorCategory { + if let (Soft, Soft) = (self, other) { Soft } else { Hard } + } + + fn to_usize(self) -> usize { + match self { + Soft => 1, + Hard => 8, + } + } +} + +fn is_separator(c: char) -> bool { + classify_separator(c).is_some() +} + +fn classify_separator(c: char) -> Option { + match c { + ' ' | '\'' | '"' => Some(Soft), + '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Hard), + _ => None, + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] enum CharCategory { - Space, + Separator(SeparatorCategory), Cjk, Other, } fn classify_char(c: char) -> CharCategory { - if c.is_whitespace() { CharCategory::Space } - else if is_cjk(c) { CharCategory::Cjk } - else { CharCategory::Other } -} - -fn is_word(s: &&str) -> bool { - !s.chars().any(char::is_whitespace) -} - -fn same_group_category(a: char, b: char) -> bool { - let ca = classify_char(a); - let cb = classify_char(b); - if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } -} - -pub fn split_query_string(query: &str) -> impl Iterator { - LinearStrGroupBy::new(query, same_group_category).filter(is_word) -} - -pub trait TokenizerBuilder { - fn build<'a>(&self, text: &'a str) -> Box> + 'a>; -} - -pub struct DefaultBuilder; - -impl DefaultBuilder { - pub fn new() -> DefaultBuilder { - DefaultBuilder + if let Some(category) = classify_separator(c) { + CharCategory::Separator(category) + } else if is_cjk(c) { + CharCategory::Cjk + } else { + CharCategory::Other } } -#[derive(Debug, PartialEq, Eq)] +fn is_str_word(s: &str) -> bool { + !s.chars().any(is_separator) +} + +fn same_group_category(a: char, b: char) -> bool { + match (classify_char(a), classify_char(b)) { + (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false, + (CharCategory::Separator(_), CharCategory::Separator(_)) => true, + (a, b) => a == b, + } +} + +// fold the number of chars along with the index position +fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) { + (n + 1, i + c.len_utf8()) +} + +pub fn split_query_string(query: &str) -> impl Iterator { + Tokenizer::new(query).map(|t| t.word) +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct Token<'a> { pub word: &'a str, pub word_index: usize, pub char_index: usize, } -impl TokenizerBuilder for DefaultBuilder { - fn build<'a>(&self, text: &'a str) -> Box> + 'a> { - Box::new(Tokenizer::new(text)) - } -} - pub struct Tokenizer<'a> { + inner: &'a str, word_index: usize, char_index: usize, - inner: &'a str, } impl<'a> Tokenizer<'a> { pub fn new(string: &str) -> Tokenizer { - let mut char_advance = 0; - let mut index_advance = 0; - for (n, (i, c)) in string.char_indices().enumerate() { - char_advance = n; - index_advance = i; - if detect_separator(c).is_none() { break } - } + // skip every separator and set `char_index` + // to the number of char trimmed + let (count, index) = string.char_indices() + .take_while(|(_, c)| is_separator(*c)) + .fold((0, 0), chars_count_index); Tokenizer { + inner: &string[index..], word_index: 0, - char_index: char_advance, - inner: &string[index_advance..], + char_index: count, } } } -#[derive(Debug, Clone, Copy)] -enum Separator { - Short, - Long, -} - -impl Separator { - fn add(self, add: Separator) -> Separator { - match (self, add) { - (_, Long) => Long, - (Short, Short) => Short, - (Long, Short) => Long, - } - } - - fn to_usize(self) -> usize { - match self { - Short => 1, - Long => 8, - } - } -} - -fn detect_separator(c: char) -> Option { - match c { - '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long), - ' ' | '\'' | '"' => Some(Short), - _ => None, - } -} - impl<'a> Iterator for Tokenizer<'a> { type Item = Token<'a>; fn next(&mut self) -> Option { - let mut start_word = None; - let mut distance = None; + let mut iter = self.inner.linear_group_by(same_group_category).peekable(); - for (i, c) in self.inner.char_indices() { - match detect_separator(c) { - Some(sep) => { - if let Some(start_word) = start_word { - let (prefix, tail) = self.inner.split_at(i); - let (spaces, word) = prefix.split_at(start_word); + while let (Some(string), next_string) = (iter.next(), iter.peek()) { + let (count, index) = string.char_indices().fold((0, 0), chars_count_index); - self.inner = tail; - self.char_index += spaces.chars().count(); - self.word_index += distance.map(Separator::to_usize).unwrap_or(0); - - let token = Token { - word: word, - word_index: self.word_index, - char_index: self.char_index, - }; - - self.char_index += word.chars().count(); - return Some(token) - } - - distance = Some(distance.map_or(sep, |s| s.add(sep))); - }, - None => { - // if this is a Chinese, a Japanese or a Korean character - // See - if is_cjk(c) { - match start_word { - Some(start_word) => { - let (prefix, tail) = self.inner.split_at(i); - let (spaces, word) = prefix.split_at(start_word); - - self.inner = tail; - self.char_index += spaces.chars().count(); - self.word_index += distance.map(Separator::to_usize).unwrap_or(0); - - let token = Token { - word: word, - word_index: self.word_index, - char_index: self.char_index, - }; - - self.word_index += 1; - self.char_index += word.chars().count(); - - return Some(token) - }, - None => { - let (prefix, tail) = self.inner.split_at(i + c.len_utf8()); - let (spaces, word) = prefix.split_at(i); - - self.inner = tail; - self.char_index += spaces.chars().count(); - self.word_index += distance.map(Separator::to_usize).unwrap_or(0); - - let token = Token { - word: word, - word_index: self.word_index, - char_index: self.char_index, - }; - - if tail.chars().next().and_then(detect_separator).is_none() { - self.word_index += 1; - } - self.char_index += 1; - - return Some(token) - } - } - } - - if start_word.is_none() { start_word = Some(i) } - }, + if !is_str_word(string) { + self.word_index += string.chars() + .filter_map(classify_separator) + .fold(Soft, |a, x| a.merge(x)) + .to_usize(); + self.char_index += count; + self.inner = &self.inner[index..]; + continue; } - } - - if let Some(start_word) = start_word { - let prefix = mem::replace(&mut self.inner, ""); - let (spaces, word) = prefix.split_at(start_word); let token = Token { - word: word, - word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0), - char_index: self.char_index + spaces.chars().count(), + word: string, + word_index: self.word_index, + char_index: self.char_index, }; - return Some(token) + + if next_string.filter(|s| is_str_word(s)).is_some() { + self.word_index += 1; + } + + self.char_index += count; + self.inner = &self.inner[index..]; + + return Some(token); } + self.inner = ""; None } } From 397522f277aab19ebc6de859379880c214b78486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Feb 2019 13:50:46 +0100 Subject: [PATCH 10/44] fet: Move meilidb example into the meilidb workspace --- examples/ebay/schema-example.toml | 19 ------------------- .../examples}/create-database.rs | 4 +--- .../examples}/query-database.rs | 2 +- 3 files changed, 2 insertions(+), 23 deletions(-) delete mode 100644 examples/ebay/schema-example.toml rename {examples => meilidb/examples}/create-database.rs (95%) rename {examples => meilidb/examples}/query-database.rs (99%) diff --git a/examples/ebay/schema-example.toml b/examples/ebay/schema-example.toml deleted file mode 100644 index fcf2685e9..000000000 --- a/examples/ebay/schema-example.toml +++ /dev/null @@ -1,19 +0,0 @@ -# This schema has been generated ... -# The order in which the attributes are declared is important, -# it specify the attribute xxx... - -identifier = "id" - -[attributes.id] -stored = true - -[attributes.title] -stored = true -indexed = true - -[attributes.description] -stored = true -indexed = true - -[attributes.image] -stored = true diff --git a/examples/create-database.rs b/meilidb/examples/create-database.rs similarity index 95% rename from examples/create-database.rs rename to meilidb/examples/create-database.rs index 37e252e1a..e5d9c403a 100644 --- a/examples/create-database.rs +++ b/meilidb/examples/create-database.rs @@ -13,7 +13,6 @@ use serde_derive::{Serialize, Deserialize}; use structopt::StructOpt; use meilidb::database::{Database, Schema}; -use meilidb::tokenizer::DefaultBuilder; #[derive(Debug, StructOpt)] pub struct Opt { @@ -63,7 +62,6 @@ fn index( let mut end_of_file = false; while !end_of_file { - let tokenizer_builder = DefaultBuilder::new(); let mut update = database.start_update("default")?; loop { @@ -78,7 +76,7 @@ fn index( } }; - update.update_document(&document, &tokenizer_builder, &stop_words)?; + update.update_document(&document, &stop_words)?; print!("\rindexing document {}", i); i += 1; diff --git a/examples/query-database.rs b/meilidb/examples/query-database.rs similarity index 99% rename from examples/query-database.rs rename to meilidb/examples/query-database.rs index ca6733c30..2689ffe0f 100644 --- a/examples/query-database.rs +++ b/meilidb/examples/query-database.rs @@ -11,10 +11,10 @@ use std::error::Error; use hashbrown::{HashMap, HashSet}; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use structopt::StructOpt; +use meilidb_core::Match; use meilidb::database::schema::SchemaAttr; use meilidb::database::Database; -use meilidb::Match; #[derive(Debug, StructOpt)] pub struct Opt { From 87f9528791ef489f28939a6d53523e436364a522 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Feb 2019 14:49:50 +0100 Subject: [PATCH 11/44] feat: Use the new Tokenizer --- meilidb/src/database/mod.rs | 34 +++++++------------ .../src/database/serde/indexer_serializer.rs | 11 +++--- meilidb/src/database/serde/serializer.rs | 30 +++++----------- meilidb/src/database/update/mod.rs | 6 +--- 4 files changed, 25 insertions(+), 56 deletions(-) diff --git a/meilidb/src/database/mod.rs b/meilidb/src/database/mod.rs index 08ca6cd7f..16ba148b7 100644 --- a/meilidb/src/database/mod.rs +++ b/meilidb/src/database/mod.rs @@ -430,7 +430,6 @@ mod tests { use std::error::Error; use serde_derive::{Serialize, Deserialize}; - use meilidb_tokenizer::DefaultBuilder; use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; @@ -478,11 +477,10 @@ mod tests { timestamp: 7654321, }; - let tokenizer_builder = DefaultBuilder::new(); let mut builder = database.start_update(meilidb_index_name)?; - let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?; - let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?; + let docid0 = builder.update_document(&doc0, &stop_words)?; + let docid1 = builder.update_document(&doc1, &stop_words)?; let view = database.commit_update(builder)?; @@ -549,16 +547,14 @@ mod tests { timestamp: 7654321, }; - let tokenizer_builder = DefaultBuilder::new(); - let mut builder = database.start_update(meilidb_index_name)?; - let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?; - let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?; + let docid0 = builder.update_document(&doc0, &stop_words)?; + let docid1 = builder.update_document(&doc1, &stop_words)?; database.commit_update(builder)?; let mut builder = database.start_update(meilidb_index_name)?; - let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?; - let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?; + let docid2 = builder.update_document(&doc2, &stop_words)?; + let docid3 = builder.update_document(&doc3, &stop_words)?; let view = database.commit_update(builder)?; let de_doc0: SimpleDoc = view.document_by_id(docid0)?; @@ -640,7 +636,6 @@ mod bench { description: String, } - let tokenizer_builder = DefaultBuilder; let mut builder = database.start_update(index_name)?; let mut rng = XorShiftRng::seed_from_u64(42); @@ -650,7 +645,7 @@ mod bench { title: random_sentences(rng.gen_range(1, 8), &mut rng), description: random_sentences(rng.gen_range(20, 200), &mut rng), }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; + builder.update_document(&document, &stop_words)?; } database.commit_update(builder)?; @@ -688,7 +683,6 @@ mod bench { description: String, } - let tokenizer_builder = DefaultBuilder; let mut builder = database.start_update(index_name)?; let mut rng = XorShiftRng::seed_from_u64(42); @@ -698,7 +692,7 @@ mod bench { title: random_sentences(rng.gen_range(1, 8), &mut rng), description: random_sentences(rng.gen_range(20, 200), &mut rng), }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; + builder.update_document(&document, &stop_words)?; } database.commit_update(builder)?; @@ -737,7 +731,6 @@ mod bench { description: String, } - let tokenizer_builder = DefaultBuilder; let mut builder = database.start_update(index_name)?; let mut rng = XorShiftRng::seed_from_u64(42); @@ -747,7 +740,7 @@ mod bench { title: random_sentences(rng.gen_range(1, 8), &mut rng), description: random_sentences(rng.gen_range(20, 200), &mut rng), }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; + builder.update_document(&document, &stop_words)?; } database.commit_update(builder)?; @@ -785,7 +778,6 @@ mod bench { description: String, } - let tokenizer_builder = DefaultBuilder; let mut builder = database.start_update(index_name)?; let mut rng = XorShiftRng::seed_from_u64(42); @@ -795,7 +787,7 @@ mod bench { title: random_sentences(rng.gen_range(1, 8), &mut rng), description: random_sentences(rng.gen_range(20, 200), &mut rng), }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; + builder.update_document(&document, &stop_words)?; } let view = database.commit_update(builder)?; @@ -833,7 +825,6 @@ mod bench { description: String, } - let tokenizer_builder = DefaultBuilder; let mut builder = database.start_update(index_name)?; let mut rng = XorShiftRng::seed_from_u64(42); @@ -843,7 +834,7 @@ mod bench { title: random_sentences(rng.gen_range(1, 8), &mut rng), description: random_sentences(rng.gen_range(20, 200), &mut rng), }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; + builder.update_document(&document, &stop_words)?; } let view = database.commit_update(builder)?; @@ -882,7 +873,6 @@ mod bench { description: String, } - let tokenizer_builder = DefaultBuilder; let mut builder = database.start_update(index_name)?; let mut rng = XorShiftRng::seed_from_u64(42); @@ -892,7 +882,7 @@ mod bench { title: random_sentences(rng.gen_range(1, 8), &mut rng), description: random_sentences(rng.gen_range(20, 200), &mut rng), }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; + builder.update_document(&document, &stop_words)?; } let view = database.commit_update(builder)?; diff --git a/meilidb/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs index f718111dc..ae5a0e4cb 100644 --- a/meilidb/src/database/serde/indexer_serializer.rs +++ b/meilidb/src/database/serde/indexer_serializer.rs @@ -3,23 +3,20 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; use meilidb_core::{DocumentId, DocIndex}; -use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk}; +use meilidb_tokenizer::{Tokenizer, Token, is_cjk}; use crate::database::update::DocumentUpdate; use crate::database::serde::SerializerError; use crate::database::schema::SchemaAttr; -pub struct IndexerSerializer<'a, 'b, B> { - pub tokenizer_builder: &'a B, +pub struct IndexerSerializer<'a, 'b> { pub update: &'a mut DocumentUpdate<'b>, pub document_id: DocumentId, pub attribute: SchemaAttr, pub stop_words: &'a HashSet, } -impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B> -where B: TokenizerBuilder -{ +impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> { type Ok = (); type Error = SerializerError; type SerializeSeq = ser::Impossible; @@ -49,7 +46,7 @@ where B: TokenizerBuilder } fn serialize_str(self, v: &str) -> Result { - for token in self.tokenizer_builder.build(v) { + for token in Tokenizer::new(v) { let Token { word, word_index, char_index } = token; let document_id = self.document_id; diff --git a/meilidb/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs index 8b3a05b46..e1be310ed 100644 --- a/meilidb/src/database/serde/serializer.rs +++ b/meilidb/src/database/serde/serializer.rs @@ -2,7 +2,6 @@ use std::collections::HashSet; use serde::Serialize; use serde::ser; -use meilidb_tokenizer::TokenizerBuilder; use crate::database::serde::indexer_serializer::IndexerSerializer; use crate::database::serde::key_to_string::KeyToStringSerializer; @@ -12,25 +11,22 @@ use crate::database::serde::SerializerError; use crate::database::schema::Schema; use meilidb_core::DocumentId; -pub struct Serializer<'a, 'b, B> { +pub struct Serializer<'a, 'b> { pub schema: &'a Schema, pub update: &'a mut DocumentUpdate<'b>, pub document_id: DocumentId, - pub tokenizer_builder: &'a B, pub stop_words: &'a HashSet, } -impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B> -where B: TokenizerBuilder -{ +impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> { type Ok = (); type Error = SerializerError; type SerializeSeq = ser::Impossible; type SerializeTuple = ser::Impossible; type SerializeTupleStruct = ser::Impossible; type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapSerializer<'a, 'b, B>; - type SerializeStruct = StructSerializer<'a, 'b, B>; + type SerializeMap = MapSerializer<'a, 'b>; + type SerializeStruct = StructSerializer<'a, 'b>; type SerializeStructVariant = ser::Impossible; forward_to_unserializable_type! { @@ -142,7 +138,6 @@ where B: TokenizerBuilder schema: self.schema, document_id: self.document_id, update: self.update, - tokenizer_builder: self.tokenizer_builder, stop_words: self.stop_words, current_key_name: None, }) @@ -158,7 +153,6 @@ where B: TokenizerBuilder schema: self.schema, document_id: self.document_id, update: self.update, - tokenizer_builder: self.tokenizer_builder, stop_words: self.stop_words, }) } @@ -175,18 +169,15 @@ where B: TokenizerBuilder } } -pub struct MapSerializer<'a, 'b, B> { +pub struct MapSerializer<'a, 'b> { pub schema: &'a Schema, pub document_id: DocumentId, pub update: &'a mut DocumentUpdate<'b>, - pub tokenizer_builder: &'a B, pub stop_words: &'a HashSet, pub current_key_name: Option, } -impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B> -where B: TokenizerBuilder -{ +impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> { type Ok = (); type Error = SerializerError; @@ -223,7 +214,6 @@ where B: TokenizerBuilder if props.is_indexed() { let serializer = IndexerSerializer { update: self.update, - tokenizer_builder: self.tokenizer_builder, document_id: self.document_id, attribute: attr, stop_words: self.stop_words, @@ -244,17 +234,14 @@ where B: TokenizerBuilder } } -pub struct StructSerializer<'a, 'b, B> { +pub struct StructSerializer<'a, 'b> { pub schema: &'a Schema, pub document_id: DocumentId, pub update: &'a mut DocumentUpdate<'b>, - pub tokenizer_builder: &'a B, pub stop_words: &'a HashSet, } -impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B> -where B: TokenizerBuilder -{ +impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> { type Ok = (); type Error = SerializerError; @@ -274,7 +261,6 @@ where B: TokenizerBuilder if props.is_indexed() { let serializer = IndexerSerializer { update: self.update, - tokenizer_builder: self.tokenizer_builder, document_id: self.document_id, attribute: attr, stop_words: self.stop_words, diff --git a/meilidb/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs index 720b7aaf3..f34cf6a8e 100644 --- a/meilidb/src/database/update/mod.rs +++ b/meilidb/src/database/update/mod.rs @@ -8,7 +8,6 @@ use serde::Serialize; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::data::DocIds; use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; -use meilidb_tokenizer::TokenizerBuilder; use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; use crate::database::serde::serializer::Serializer; @@ -36,21 +35,18 @@ impl Update { Update { schema, raw_builder: RawUpdateBuilder::new() } } - pub fn update_document( + pub fn update_document( &mut self, document: T, - tokenizer_builder: &B, stop_words: &HashSet, ) -> Result where T: Serialize, - B: TokenizerBuilder, { let document_id = self.schema.document_id(&document)?; let serializer = Serializer { schema: &self.schema, document_id: document_id, - tokenizer_builder: tokenizer_builder, update: &mut self.raw_builder.document_update(document_id)?, stop_words: stop_words, }; From e56106cbdca66ab7a23d6f0ec7d31358df4213be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 16 Mar 2019 10:46:34 +0100 Subject: [PATCH 12/44] chore: Update the toml dependency --- meilidb/Cargo.toml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 8a042168c..9f12a3a2f 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -21,13 +21,9 @@ serde_derive = "1.0.88" serde_json = { version = "1.0.38", features = ["preserve_order"] } size_format = "1.0.2" slice-group-by = "0.2.4" +toml = { version = "0.5.0", features = ["preserve_order"] } unidecode = "0.3.0" -[dependencies.toml] -git = "https://github.com/Kerollmops/toml-rs.git" -features = ["preserve_order"] -rev = "0372ba6" - [dependencies.rocksdb] git = "https://github.com/pingcap/rust-rocksdb.git" rev = "306e201" From acede0f3e88cbcca7572197b45ba5cab65abc13e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 16 Mar 2019 10:48:04 +0100 Subject: [PATCH 13/44] fix: Correctly assert the DocIndex memory size --- meilidb-core/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 8b8606ddf..11c734e37 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -292,6 +292,6 @@ mod tests { #[test] fn docindex_mem_size() { - assert_eq!(mem::size_of::(), 24); + assert_eq!(mem::size_of::(), 16); } } From c6bb2b6f9cbe3a5a505491736c0e63a868fe5a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 16 Mar 2019 12:19:36 +0100 Subject: [PATCH 14/44] chore: Make the debug symbols available for release binaries --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 139e8b472..1d97a68c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,3 +4,6 @@ members = [ "meilidb-core", "meilidb-tokenizer", ] + +[profile.release] +debug = true From abf7191eec965e569317ef7b6ab4b38eef556216 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 18 Mar 2019 14:42:59 +0100 Subject: [PATCH 15/44] feat: Make the Tokenizer able to support tokenizing sequences --- meilidb-tokenizer/src/lib.rs | 66 ++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs index 48bce151b..88e389a46 100644 --- a/meilidb-tokenizer/src/lib.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,3 +1,4 @@ +use std::iter::Peekable; use slice_group_by::StrGroupBy; use self::SeparatorCategory::*; @@ -151,6 +152,71 @@ impl<'a> Iterator for Tokenizer<'a> { } } +pub struct SeqTokenizer<'a, I> +where I: Iterator, +{ + inner: I, + current: Option>>, + word_offset: usize, + char_offset: usize, +} + +impl<'a, I> SeqTokenizer<'a, I> +where I: Iterator, +{ + pub fn new(mut iter: I) -> SeqTokenizer<'a, I> { + let current = iter.next().map(|s| Tokenizer::new(s).peekable()); + SeqTokenizer { + inner: iter, + current: current, + word_offset: 0, + char_offset: 0, + } + } +} + +impl<'a, I> Iterator for SeqTokenizer<'a, I> +where I: Iterator, +{ + type Item = Token<'a>; + + fn next(&mut self) -> Option { + match &mut self.current { + Some(current) => { + match current.next() { + Some(token) => { + // we must apply the word and char offsets + // to the token before returning it + let token = Token { + word: token.word, + word_index: token.word_index + self.word_offset, + char_index: token.char_index + self.char_offset, + }; + + // if this is the last iteration on this text + // we must save the offsets for next texts + if current.peek().is_none() { + let hard_space = SeparatorCategory::Hard.to_usize(); + self.word_offset = token.word_index + hard_space; + self.char_offset = token.char_index + hard_space; + } + + Some(token) + }, + None => { + // no more words in this text we must + // start tokenizing the next text + self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable()); + self.next() + }, + } + }, + // no more texts available + None => None, + } + } +} + #[cfg(test)] mod tests { use super::*; From 77405cc10375cecf18d367cabdf8f3039f6c6ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Mar 2019 16:58:55 +0100 Subject: [PATCH 16/44] chore: Remove the database module from meilidb --- meilidb/Cargo.toml | 32 +- meilidb/src/database/config.rs | 46 - meilidb/src/database/document_key.rs | 149 --- meilidb/src/database/mod.rs | 899 ------------------ meilidb/src/database/number.rs | 98 -- meilidb/src/database/schema.rs | 319 ------- meilidb/src/database/serde/deserializer.rs | 186 ---- meilidb/src/database/serde/find_id.rs | 243 ----- .../src/database/serde/indexer_serializer.rs | 190 ---- meilidb/src/database/serde/key_to_string.rs | 146 --- meilidb/src/database/serde/mod.rs | 65 -- meilidb/src/database/serde/serializer.rs | 282 ------ meilidb/src/database/serde/value_to_number.rs | 176 ---- meilidb/src/database/update/index_event.rs | 55 -- meilidb/src/database/update/mod.rs | 234 ----- .../src/database/update/ranked_map_event.rs | 58 -- meilidb/src/database/view.rs | 199 ---- meilidb/src/lib.rs | 3 - 18 files changed, 3 insertions(+), 3377 deletions(-) delete mode 100644 meilidb/src/database/config.rs delete mode 100644 meilidb/src/database/document_key.rs delete mode 100644 meilidb/src/database/mod.rs delete mode 100644 meilidb/src/database/number.rs delete mode 100644 meilidb/src/database/schema.rs delete mode 100644 meilidb/src/database/serde/deserializer.rs delete mode 100644 meilidb/src/database/serde/find_id.rs delete mode 100644 meilidb/src/database/serde/indexer_serializer.rs delete mode 100644 meilidb/src/database/serde/key_to_string.rs delete mode 100644 meilidb/src/database/serde/mod.rs delete mode 100644 meilidb/src/database/serde/serializer.rs delete mode 100644 meilidb/src/database/serde/value_to_number.rs delete mode 100644 meilidb/src/database/update/index_event.rs delete mode 100644 meilidb/src/database/update/mod.rs delete mode 100644 meilidb/src/database/update/ranked_map_event.rs delete mode 100644 meilidb/src/database/view.rs diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 9f12a3a2f..8dc6f0db5 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -5,39 +5,13 @@ version = "0.3.1" authors = ["Kerollmops "] [dependencies] -arc-swap = "0.3.7" -bincode = "1.1.2" -byteorder = "1.3.1" -fst = "0.3.3" -hashbrown = { version = "0.1.8", features = ["serde"] } -linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } -lockfree = "0.5.1" -log = "0.4.6" meilidb-core = { path = "../meilidb-core", version = "0.1.0" } meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } -sdset = "0.3.1" -serde = "1.0.88" -serde_derive = "1.0.88" -serde_json = { version = "1.0.38", features = ["preserve_order"] } -size_format = "1.0.2" -slice-group-by = "0.2.4" -toml = { version = "0.5.0", features = ["preserve_order"] } -unidecode = "0.3.0" - -[dependencies.rocksdb] -git = "https://github.com/pingcap/rust-rocksdb.git" -rev = "306e201" [features] -default = ["simd"] -i128 = ["bincode/i128", "meilidb-core/i128"] -portable = ["rocksdb/portable"] -simd = ["rocksdb/sse"] -nightly = [ - "hashbrown/nightly", - "slice-group-by/nightly", - "meilidb-core/nightly" -] +default = [] +i128 = ["meilidb-core/i128"] +nightly = ["meilidb-core/nightly"] [dev-dependencies] csv = "1.0.5" diff --git a/meilidb/src/database/config.rs b/meilidb/src/database/config.rs deleted file mode 100644 index 491cdba93..000000000 --- a/meilidb/src/database/config.rs +++ /dev/null @@ -1,46 +0,0 @@ -use std::collections::{HashSet, HashMap}; -use serde_derive::{Serialize, Deserialize}; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum RankingOrdering { - Asc, - Dsc -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct AccessToken { - pub read_key: String, - pub write_key: String, - pub admin_key: String, -} - - -#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct Config { - pub stop_words: Option>, - pub ranking_order: Option>, - pub distinct_field: Option, - pub ranking_rules: Option>, - pub access_token: Option, -} - -impl Config { - pub fn update_with(&mut self, new: Config) { - if let Some(stop_words) = new.stop_words { - self.stop_words = Some(stop_words); - }; - if let Some(ranking_order) = new.ranking_order { - self.ranking_order = Some(ranking_order); - }; - if let Some(distinct_field) = new.distinct_field { - self.distinct_field = Some(distinct_field); - }; - if let Some(ranking_rules) = new.ranking_rules { - self.ranking_rules = Some(ranking_rules); - }; - if let Some(access_token) = new.access_token { - self.access_token = Some(access_token); - }; - } -} diff --git a/meilidb/src/database/document_key.rs b/meilidb/src/database/document_key.rs deleted file mode 100644 index d6b9865ef..000000000 --- a/meilidb/src/database/document_key.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::io::{Cursor, Read, Write}; -use std::mem::size_of; -use std::fmt; - -use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt}; - -use crate::database::schema::SchemaAttr; -use meilidb_core::DocumentId; - -const DOC_KEY_LEN: usize = 4 + size_of::(); -const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::(); - -#[derive(Copy, Clone)] -pub struct DocumentKey([u8; DOC_KEY_LEN]); - -impl DocumentKey { - pub fn new(id: DocumentId) -> DocumentKey { - let mut buffer = [0; DOC_KEY_LEN]; - - let mut wtr = Cursor::new(&mut buffer[..]); - wtr.write_all(b"doc-").unwrap(); - wtr.write_u64::(id.0).unwrap(); - - DocumentKey(buffer) - } - - pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey { - assert!(bytes.len() >= DOC_KEY_LEN); - assert_eq!(&bytes[..4], b"doc-"); - - let mut buffer = [0; DOC_KEY_LEN]; - bytes.read_exact(&mut buffer).unwrap(); - - DocumentKey(buffer) - } - - pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), attr) - } - - pub fn with_attribute_min(&self) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), SchemaAttr::min()) - } - - pub fn with_attribute_max(&self) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), SchemaAttr::max()) - } - - pub fn document_id(&self) -> DocumentId { - let id = (&self.0[4..]).read_u64::().unwrap(); - DocumentId(id) - } -} - -impl AsRef<[u8]> for DocumentKey { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl fmt::Debug for DocumentKey { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("DocumentKey") - .field("document_id", &self.document_id()) - .finish() - } -} - -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]); - -impl DocumentKeyAttr { - pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr { - let mut buffer = [0; DOC_KEY_ATTR_LEN]; - let DocumentKey(raw_key) = DocumentKey::new(id); - - let mut wtr = Cursor::new(&mut buffer[..]); - wtr.write_all(&raw_key).unwrap(); - wtr.write_all(b"-").unwrap(); - wtr.write_u16::(attr.0).unwrap(); - - DocumentKeyAttr(buffer) - } - - pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr { - DocumentKeyAttr::new(id, SchemaAttr::min()) - } - - pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr { - DocumentKeyAttr::new(id, SchemaAttr::max()) - } - - pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr { - assert!(bytes.len() >= DOC_KEY_ATTR_LEN); - assert_eq!(&bytes[..4], b"doc-"); - - let mut buffer = [0; DOC_KEY_ATTR_LEN]; - bytes.read_exact(&mut buffer).unwrap(); - - DocumentKeyAttr(buffer) - } - - pub fn document_id(&self) -> DocumentId { - let id = (&self.0[4..]).read_u64::().unwrap(); - DocumentId(id) - } - - pub fn attribute(&self) -> SchemaAttr { - let offset = 4 + size_of::() + 1; - let value = (&self.0[offset..]).read_u16::().unwrap(); - SchemaAttr::new(value) - } - - pub fn into_document_key(self) -> DocumentKey { - DocumentKey::new(self.document_id()) - } -} - -impl AsRef<[u8]> for DocumentKeyAttr { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl fmt::Debug for DocumentKeyAttr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("DocumentKeyAttr") - .field("document_id", &self.document_id()) - .field("attribute", &self.attribute().0) - .finish() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn keep_as_ref_order() { - for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) { - let id = DocumentId(0); - let a = DocumentKeyAttr::new(id, SchemaAttr(a)); - let b = DocumentKeyAttr::new(id, SchemaAttr(b)); - - assert!(a < b); - assert!(a.as_ref() < b.as_ref()); - } - } -} diff --git a/meilidb/src/database/mod.rs b/meilidb/src/database/mod.rs deleted file mode 100644 index 16ba148b7..000000000 --- a/meilidb/src/database/mod.rs +++ /dev/null @@ -1,899 +0,0 @@ -use std::time::Instant; -use std::error::Error; -use std::ffi::OsStr; -use std::sync::Arc; -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::ops::{Deref, DerefMut}; - -use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions}; -use rocksdb::rocksdb::{Writable, Snapshot}; -use rocksdb::{DB, MergeOperands}; -use size_format::SizeFormatterBinary; -use arc_swap::ArcSwap; -use lockfree::map::Map; -use hashbrown::HashMap; -use log::{info, error, warn}; - -use crate::database::schema::SchemaAttr; -use meilidb_core::shared_data_cursor::FromSharedDataCursor; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::{Index, DocumentId}; - -use self::update::{ReadIndexEvent, ReadRankedMapEvent}; - -pub use self::config::Config; -pub use self::document_key::{DocumentKey, DocumentKeyAttr}; -pub use self::view::{DatabaseView, DocumentIter}; -pub use self::update::Update; -pub use self::serde::SerializerError; -pub use self::schema::Schema; -pub use self::number::{Number, ParseNumberError}; - -pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; - -const DATA_INDEX: &[u8] = b"data-index"; -const DATA_RANKED_MAP: &[u8] = b"data-ranked-map"; -const DATA_SCHEMA: &[u8] = b"data-schema"; -const CONFIG: &[u8] = b"config"; - -pub mod config; -pub mod schema; -mod number; -mod document_key; -mod serde; -mod update; -mod view; - -fn retrieve_data_schema(snapshot: &Snapshot) -> Result> -where D: Deref -{ - match snapshot.get(DATA_SCHEMA)? { - Some(vector) => Ok(Schema::read_from_bin(&*vector)?), - None => Err(String::from("BUG: no schema found in the database").into()), - } -} - -fn retrieve_data_index(snapshot: &Snapshot) -> Result> -where D: Deref -{ - let start = Instant::now(); - let vector = snapshot.get(DATA_INDEX)?; - info!("loading index from kv-store took {:.2?}", start.elapsed()); - - match vector { - Some(vector) => { - let start = Instant::now(); - - let bytes = vector.as_ref().to_vec(); - info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64)); - - let event = ReadIndexEvent::from_bytes(bytes)?; - let index = event.updated_documents().expect("BUG: invalid event deserialized"); - - info!("loading index from bytes took {:.2?}", start.elapsed()); - - Ok(index) - }, - None => Ok(Index::default()), - } -} - -fn retrieve_data_ranked_map(snapshot: &Snapshot) -> Result> -where D: Deref, -{ - let start = Instant::now(); - let vector = snapshot.get(DATA_RANKED_MAP)?; - info!("loading ranked map from kv-store took {:.2?}", start.elapsed()); - - match vector { - Some(vector) => { - let start = Instant::now(); - - let bytes = vector.as_ref().to_vec(); - info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64)); - - let event = ReadRankedMapEvent::from_bytes(bytes)?; - let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized"); - - info!("loading ranked map from bytes took {:.2?}", start.elapsed()); - - Ok(ranked_map) - }, - None => Ok(RankedMap::new()), - } -} - -fn retrieve_config(snapshot: &Snapshot) -> Result> -where D: Deref, -{ - match snapshot.get(CONFIG)? { - Some(vector) => Ok(bincode::deserialize(&*vector)?), - None => Ok(Config::default()), - } -} - -fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - use self::update::ReadIndexEvent::{self, *}; - use self::update::WriteIndexEvent; - - let mut index = Index::default(); - for bytes in existing.into_iter().chain(operands) { - match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() { - RemovedDocuments(d) => index = index.remove_documents(d.as_ref()), - UpdatedDocuments(i) => index = index.union(&i), - } - } - - WriteIndexEvent::UpdatedDocuments(&index).into_bytes() -} - -fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - use self::update::ReadRankedMapEvent::{self, *}; - use self::update::WriteRankedMapEvent; - - let mut ranked_map = RankedMap::default(); - for bytes in existing.into_iter().chain(operands) { - match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() { - RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()), - UpdatedDocuments(i) => ranked_map.extend(i), - } - } - - WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes() -} - -fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - match key { - DATA_INDEX => merge_indexes(existing, operands), - DATA_RANKED_MAP => merge_ranked_maps(existing, operands), - key => panic!("The merge operator does not support merging {:?}", key), - } -} - -pub struct IndexUpdate { - index: String, - update: Update, -} - -impl Deref for IndexUpdate { - type Target = Update; - - fn deref(&self) -> &Update { - &self.update - } -} - -impl DerefMut for IndexUpdate { - fn deref_mut(&mut self) -> &mut Update { - &mut self.update - } -} - -struct DatabaseIndex { - db: Arc, - - // This view is updated each time the DB ingests an update. - view: ArcSwap>>, - - // The path of the mdb folder stored on disk. - path: PathBuf, - - // must_die false by default, must be set as true when the Index is dropped. - // It is used to erase the folder saved on disk when the user request to delete an index. - must_die: AtomicBool, -} - -impl DatabaseIndex { - fn create>(path: P, schema: &Schema) -> Result> { - let path = path.as_ref(); - if path.exists() { - return Err(format!("File already exists at path: {}, cannot create database.", - path.display()).into()) - } - - let path_lossy = path.to_string_lossy(); - let mut opts = DBOptions::new(); - opts.create_if_missing(true); - // opts.error_if_exists(true); // FIXME pull request that - - let mut cf_opts = ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data merge operator", merge_operator); - - let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; - - let mut schema_bytes = Vec::new(); - schema.write_to_bin(&mut schema_bytes)?; - db.put(DATA_SCHEMA, &schema_bytes)?; - - let db = Arc::new(db); - let snapshot = Snapshot::new(db.clone()); - let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?)); - - Ok(DatabaseIndex { - db: db, - view: view, - path: path.to_path_buf(), - must_die: AtomicBool::new(false) - }) - } - - fn open>(path: P) -> Result> { - let path_lossy = path.as_ref().to_string_lossy(); - - let mut opts = DBOptions::new(); - opts.create_if_missing(false); - - let mut cf_opts = ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data merge operator", merge_operator); - - let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; - - // FIXME create a generic function to do that ! - let _schema = match db.get(DATA_SCHEMA)? { - Some(value) => Schema::read_from_bin(&*value)?, - None => return Err(String::from("Database does not contain a schema").into()), - }; - - let db = Arc::new(db); - let snapshot = Snapshot::new(db.clone()); - let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?)); - - Ok(DatabaseIndex { - db: db, - view: view, - path: path.as_ref().to_path_buf(), - must_die: AtomicBool::new(false) - }) - } - - fn must_die(&self) { - self.must_die.store(true, Ordering::Relaxed) - } - - fn start_update(&self) -> Result> { - let schema = match self.db.get(DATA_SCHEMA)? { - Some(value) => Schema::read_from_bin(&*value)?, - None => panic!("Database does not contain a schema"), - }; - - Ok(Update::new(schema)) - } - - fn commit_update(&self, update: Update) -> Result>>, Box> { - let batch = update.build()?; - self.db.write(batch)?; - self.db.compact_range(None, None); - self.db.flush(true)?; - - let snapshot = Snapshot::new(self.db.clone()); - let view = Arc::new(DatabaseView::new(snapshot)?); - self.view.store(view.clone()); - - Ok(view) - } - - fn view(&self) -> Arc>> { - self.view.load() - } - - fn get_config(&self) -> Config { - self.view().config().clone() - } - - fn update_config(&self, config: Config) -> Result>>, Box>{ - let data = bincode::serialize(&config)?; - self.db.put(CONFIG, &data)?; - - let snapshot = Snapshot::new(self.db.clone()); - let view = Arc::new(DatabaseView::new(snapshot)?); - self.view.store(view.clone()); - - Ok(view) - } - - fn path(&self) -> &Path { - self.path.as_path() - } -} - -impl Drop for DatabaseIndex { - fn drop(&mut self) { - if self.must_die.load(Ordering::Relaxed) { - if let Err(err) = fs::remove_dir_all(&self.path) { - error!("Impossible to remove mdb when Database is dropped; {}", err); - } - } - } -} - -pub struct Database { - indexes: Map>, - path: PathBuf, -} - -impl Database { - pub fn create>(path: P) -> Result> { - Ok(Database { - indexes: Map::new(), - path: path.as_ref().to_path_buf(), - }) - } - - pub fn open>(path: P) -> Result> { - let entries = fs::read_dir(&path)?; - - let indexes = Map::new(); - for entry in entries { - let path = match entry { - Ok(p) => p.path(), - Err(err) => { - warn!("Impossible to retrieve the path from an entry; {}", err); - continue - } - }; - - let name = match path.file_stem().and_then(OsStr::to_str) { - Some(name) => name.to_owned(), - None => continue - }; - - let db = match DatabaseIndex::open(path.clone()) { - Ok(db) => db, - Err(err) => { - warn!("Impossible to open the database; {}", err); - continue - } - }; - - info!("Load database {}", name); - indexes.insert(name, Arc::new(db)); - } - - Ok(Database { - indexes: indexes, - path: path.as_ref().to_path_buf(), - }) - } - - pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box> { - let index_path = self.path.join(name); - - if index_path.exists() { - return Err("Index already exists".into()); - } - - let index = DatabaseIndex::create(index_path, schema)?; - self.indexes.insert(name.to_owned(), Arc::new(index)); - - Ok(()) - } - - pub fn delete_index(&self, name: &str) -> Result<(), Box> { - let index_guard = self.indexes.remove(name).ok_or("Index not found")?; - index_guard.val().must_die(); - - Ok(()) - } - - pub fn list_indexes(&self) -> Vec { - self.indexes.iter().map(|g| g.key().clone()).collect() - } - - pub fn start_update(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - let update = index_guard.val().start_update()?; - - Ok(IndexUpdate { index: index.to_owned(), update }) - } - - pub fn commit_update(&self, update: IndexUpdate)-> Result>>, Box> { - let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?; - - index_guard.val().commit_update(update.update) - } - - pub fn view(&self, index: &str) -> Result>>, Box> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().view()) - } - - pub fn get_config(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().get_config()) - } - - pub fn update_config(&self, index: &str, config: Config) -> Result>>, Box>{ - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().update_config(config)?) - } - - pub fn path(&self) -> &Path { - self.path.as_path() - } - - pub fn index_path(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - let path = index_guard.val().path(); - Ok(path.to_path_buf()) - } - -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - use std::error::Error; - - use serde_derive::{Serialize, Deserialize}; - - use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; - - use super::*; - - #[test] - fn ingest_one_easy_update() -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let meilidb_path = dir.path().join("meilidb.mdb"); - let meilidb_index_name = "default"; - - #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] - struct SimpleDoc { - id: u64, - title: String, - description: String, - timestamp: u64, - } - - let schema = { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("id", STORED); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - builder.new_attribute("timestamp", STORED); - builder.build() - }; - - let database = Database::create(&meilidb_path)?; - - database.create_index(meilidb_index_name, &schema)?; - - let doc0 = SimpleDoc { - id: 0, - title: String::from("I am a title"), - description: String::from("I am a description"), - timestamp: 1234567, - }; - let doc1 = SimpleDoc { - id: 1, - title: String::from("I am the second title"), - description: String::from("I am the second description"), - timestamp: 7654321, - }; - - let mut builder = database.start_update(meilidb_index_name)?; - - let docid0 = builder.update_document(&doc0, &stop_words)?; - let docid1 = builder.update_document(&doc1, &stop_words)?; - - let view = database.commit_update(builder)?; - - let de_doc0: SimpleDoc = view.document_by_id(docid0)?; - let de_doc1: SimpleDoc = view.document_by_id(docid1)?; - - assert_eq!(doc0, de_doc0); - assert_eq!(doc1, de_doc1); - - Ok(dir.close()?) - } - - #[test] - fn ingest_two_easy_updates() -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let meilidb_path = dir.path().join("meilidb.mdb"); - let meilidb_index_name = "default"; - - #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] - struct SimpleDoc { - id: u64, - title: String, - description: String, - timestamp: u64, - } - - let schema = { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("id", STORED); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - builder.new_attribute("timestamp", STORED); - builder.build() - }; - - let database = Database::create(&meilidb_path)?; - - database.create_index(meilidb_index_name, &schema)?; - - let doc0 = SimpleDoc { - id: 0, - title: String::from("I am a title"), - description: String::from("I am a description"), - timestamp: 1234567, - }; - let doc1 = SimpleDoc { - id: 1, - title: String::from("I am the second title"), - description: String::from("I am the second description"), - timestamp: 7654321, - }; - let doc2 = SimpleDoc { - id: 2, - title: String::from("I am the third title"), - description: String::from("I am the third description"), - timestamp: 7654321, - }; - let doc3 = SimpleDoc { - id: 3, - title: String::from("I am the fourth title"), - description: String::from("I am the fourth description"), - timestamp: 7654321, - }; - - let mut builder = database.start_update(meilidb_index_name)?; - let docid0 = builder.update_document(&doc0, &stop_words)?; - let docid1 = builder.update_document(&doc1, &stop_words)?; - database.commit_update(builder)?; - - let mut builder = database.start_update(meilidb_index_name)?; - let docid2 = builder.update_document(&doc2, &stop_words)?; - let docid3 = builder.update_document(&doc3, &stop_words)?; - let view = database.commit_update(builder)?; - - let de_doc0: SimpleDoc = view.document_by_id(docid0)?; - let de_doc1: SimpleDoc = view.document_by_id(docid1)?; - - assert_eq!(doc0, de_doc0); - assert_eq!(doc1, de_doc1); - - let de_doc2: SimpleDoc = view.document_by_id(docid2)?; - let de_doc3: SimpleDoc = view.document_by_id(docid3)?; - - assert_eq!(doc2, de_doc2); - assert_eq!(doc3, de_doc3); - - Ok(dir.close()?) - } -} - -#[cfg(all(feature = "nightly", test))] -mod bench { - extern crate test; - - use std::collections::HashSet; - use std::error::Error; - use std::iter::repeat_with; - use self::test::Bencher; - - use rand::distributions::Alphanumeric; - use rand_xorshift::XorShiftRng; - use rand::{Rng, SeedableRng}; - use serde_derive::Serialize; - use rand::seq::SliceRandom; - - use crate::tokenizer::DefaultBuilder; - use crate::database::schema::*; - - use super::*; - - fn random_sentences(number: usize, rng: &mut R) -> String { - let mut words = String::new(); - - for i in 0..number { - let word_len = rng.gen_range(1, 12); - let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len); - words.extend(iter); - - if i == number - 1 { // last word - let final_ = [".", "?", "!", "..."].choose(rng).cloned(); - words.extend(final_); - } else { - let middle = [",", ", "].choose(rng).cloned(); - words.extend(middle); - } - } - - words - } - - #[bench] - fn open_little_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..300 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - fn open_medium_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..3000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - #[ignore] - fn open_big_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..30_000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..300 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } - - #[bench] - fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..3000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } - - #[bench] - #[ignore] - fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..30_000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } -} diff --git a/meilidb/src/database/number.rs b/meilidb/src/database/number.rs deleted file mode 100644 index b2c4c9a88..000000000 --- a/meilidb/src/database/number.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::cmp::Ordering; -use std::str::FromStr; -use std::fmt; - -use serde_derive::{Serialize, Deserialize}; - -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone)] -pub enum Number { - Unsigned(u64), - Signed(i64), - Float(f64), -} - -impl FromStr for Number { - type Err = ParseNumberError; - - fn from_str(s: &str) -> Result { - if let Ok(unsigned) = u64::from_str(s) { - return Ok(Number::Unsigned(unsigned)) - } - - if let Ok(signed) = i64::from_str(s) { - return Ok(Number::Signed(signed)) - } - - if let Ok(float) = f64::from_str(s) { - if float == 0.0 || float.is_normal() { - return Ok(Number::Float(float)) - } - } - - Err(ParseNumberError) - } -} - -impl PartialOrd for Number { - fn partial_cmp(&self, other: &Number) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Number { - fn cmp(&self, other: &Number) -> Ordering { - use Number::*; - match (self, other) { - (Unsigned(s), Unsigned(o)) => s.cmp(o), - (Unsigned(s), Signed(o)) => { - let s = i128::from(*s); - let o = i128::from(*o); - s.cmp(&o) - }, - (Unsigned(s), Float(o)) => { - let s = *s as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - - (Signed(s), Unsigned(o)) => { - let s = i128::from(*s); - let o = i128::from(*o); - s.cmp(&o) - }, - (Signed(s), Signed(o)) => s.cmp(o), - (Signed(s), Float(o)) => { - let s = *s as f64; - s.partial_cmp(o).unwrap_or(Ordering::Equal) - }, - - (Float(s), Unsigned(o)) => { - let o = *o as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - (Float(s), Signed(o)) => { - let o = *o as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - (Float(s), Float(o)) => { - s.partial_cmp(o).unwrap_or(Ordering::Equal) - }, - } - } -} - -impl PartialEq for Number { - fn eq(&self, other: &Number) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for Number { } - -pub struct ParseNumberError; - -impl fmt::Display for ParseNumberError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("can not parse number") - } -} diff --git a/meilidb/src/database/schema.rs b/meilidb/src/database/schema.rs deleted file mode 100644 index b4e0a070c..000000000 --- a/meilidb/src/database/schema.rs +++ /dev/null @@ -1,319 +0,0 @@ -use std::collections::{HashMap, BTreeMap}; -use std::io::{Read, Write}; -use std::error::Error; -use std::{fmt, u16}; -use std::ops::BitOr; -use std::sync::Arc; - -use serde_derive::{Serialize, Deserialize}; -use linked_hash_map::LinkedHashMap; - -use crate::database::serde::find_id::FindDocumentIdSerializer; -use crate::database::serde::SerializerError; -use meilidb_core::DocumentId; - -pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false }; -pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false }; -pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true }; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct SchemaProps { - #[serde(default)] - stored: bool, - - #[serde(default)] - indexed: bool, - - #[serde(default)] - ranked: bool, -} - -impl SchemaProps { - pub fn is_stored(self) -> bool { - self.stored - } - - pub fn is_indexed(self) -> bool { - self.indexed - } - - pub fn is_ranked(self) -> bool { - self.ranked - } -} - -impl BitOr for SchemaProps { - type Output = Self; - - fn bitor(self, other: Self) -> Self::Output { - SchemaProps { - stored: self.stored | other.stored, - indexed: self.indexed | other.indexed, - ranked: self.ranked | other.ranked, - } - } -} - -#[derive(Serialize, Deserialize)] -pub struct SchemaBuilder { - identifier: String, - attributes: LinkedHashMap, -} - -impl SchemaBuilder { - pub fn with_identifier>(name: S) -> SchemaBuilder { - SchemaBuilder { - identifier: name.into(), - attributes: LinkedHashMap::new(), - } - } - - pub fn new_attribute>(&mut self, name: S, props: SchemaProps) -> SchemaAttr { - let len = self.attributes.len(); - if self.attributes.insert(name.into(), props).is_some() { - panic!("Field already inserted.") - } - SchemaAttr(len as u16) - } - - pub fn build(self) -> Schema { - let mut attrs = HashMap::new(); - let mut props = Vec::new(); - - for (i, (name, prop)) in self.attributes.into_iter().enumerate() { - attrs.insert(name.clone(), SchemaAttr(i as u16)); - props.push((name, prop)); - } - - let identifier = self.identifier; - Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Schema { - inner: Arc, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct InnerSchema { - identifier: String, - attrs: HashMap, - props: Vec<(String, SchemaProps)>, -} - -impl Schema { - pub fn from_toml(mut reader: R) -> Result> { - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - let builder: SchemaBuilder = toml::from_slice(&buffer)?; - Ok(builder.build()) - } - - pub fn to_toml(&self, mut writer: W) -> Result<(), Box> { - let identifier = self.inner.identifier.clone(); - let attributes = self.attributes_ordered(); - let builder = SchemaBuilder { identifier, attributes }; - - let string = toml::to_string_pretty(&builder)?; - writer.write_all(string.as_bytes())?; - - Ok(()) - } - - pub fn from_json(mut reader: R) -> Result> { - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - let builder: SchemaBuilder = serde_json::from_slice(&buffer)?; - Ok(builder.build()) - } - - pub fn to_json(&self, mut writer: W) -> Result<(), Box> { - let identifier = self.inner.identifier.clone(); - let attributes = self.attributes_ordered(); - let builder = SchemaBuilder { identifier, attributes }; - let string = serde_json::to_string_pretty(&builder)?; - writer.write_all(string.as_bytes())?; - - Ok(()) - } - - pub(crate) fn read_from_bin(reader: R) -> bincode::Result { - let builder: SchemaBuilder = bincode::deserialize_from(reader)?; - Ok(builder.build()) - } - - pub(crate) fn write_to_bin(&self, writer: W) -> bincode::Result<()> { - let identifier = self.inner.identifier.clone(); - let attributes = self.attributes_ordered(); - let builder = SchemaBuilder { identifier, attributes }; - - bincode::serialize_into(writer, &builder) - } - - fn attributes_ordered(&self) -> LinkedHashMap { - let mut ordered = BTreeMap::new(); - for (name, attr) in &self.inner.attrs { - let (_, props) = self.inner.props[attr.0 as usize]; - ordered.insert(attr.0, (name, props)); - } - - let mut attributes = LinkedHashMap::with_capacity(ordered.len()); - for (_, (name, props)) in ordered { - attributes.insert(name.clone(), props); - } - - attributes - } - - pub fn document_id(&self, document: T) -> Result - where T: serde::Serialize, - { - let id_attribute_name = &self.inner.identifier; - let serializer = FindDocumentIdSerializer { id_attribute_name }; - document.serialize(serializer) - } - - pub fn props(&self, attr: SchemaAttr) -> SchemaProps { - let (_, props) = self.inner.props[attr.0 as usize]; - props - } - - pub fn identifier_name(&self) -> &str { - &self.inner.identifier - } - - pub fn attribute>(&self, name: S) -> Option { - self.inner.attrs.get(name.as_ref()).cloned() - } - - pub fn attribute_name(&self, attr: SchemaAttr) -> &str { - let (name, _) = &self.inner.props[attr.0 as usize]; - name - } -} - -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -pub struct SchemaAttr(pub u16); - -impl SchemaAttr { - pub fn new(value: u16) -> SchemaAttr { - SchemaAttr(value) - } - - pub fn min() -> SchemaAttr { - SchemaAttr(0) - } - - pub fn next(self) -> Option { - self.0.checked_add(1).map(SchemaAttr) - } - - pub fn prev(self) -> Option { - self.0.checked_sub(1).map(SchemaAttr) - } - - pub fn max() -> SchemaAttr { - SchemaAttr(u16::MAX) - } -} - -impl fmt::Display for SchemaAttr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.0.fmt(f) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::error::Error; - - #[test] - fn serialize_deserialize() -> bincode::Result<()> { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", STORED); - builder.new_attribute("beta", STORED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let mut buffer = Vec::new(); - - schema.write_to_bin(&mut buffer)?; - let schema2 = Schema::read_from_bin(buffer.as_slice())?; - - assert_eq!(schema, schema2); - - Ok(()) - } - - #[test] - fn serialize_deserialize_toml() -> Result<(), Box> { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", STORED); - builder.new_attribute("beta", STORED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let mut buffer = Vec::new(); - schema.to_toml(&mut buffer)?; - - let schema2 = Schema::from_toml(buffer.as_slice())?; - assert_eq!(schema, schema2); - - let data = r#" - identifier = "id" - - [attributes."alpha"] - stored = true - - [attributes."beta"] - stored = true - indexed = true - - [attributes."gamma"] - indexed = true - "#; - let schema2 = Schema::from_toml(data.as_bytes())?; - assert_eq!(schema, schema2); - - Ok(()) - } - - #[test] - fn serialize_deserialize_json() -> Result<(), Box> { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", STORED); - builder.new_attribute("beta", STORED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let mut buffer = Vec::new(); - schema.to_json(&mut buffer)?; - - let schema2 = Schema::from_json(buffer.as_slice())?; - assert_eq!(schema, schema2); - - let data = r#" - { - "identifier": "id", - "attributes": { - "alpha": { - "stored": true - }, - "beta": { - "stored": true, - "indexed": true - }, - "gamma": { - "indexed": true - } - } - }"#; - let schema2 = Schema::from_json(data.as_bytes())?; - assert_eq!(schema, schema2); - - Ok(()) - } -} diff --git a/meilidb/src/database/serde/deserializer.rs b/meilidb/src/database/serde/deserializer.rs deleted file mode 100644 index 92374ab48..000000000 --- a/meilidb/src/database/serde/deserializer.rs +++ /dev/null @@ -1,186 +0,0 @@ -use std::error::Error; -use std::ops::Deref; -use std::fmt; - -use rocksdb::rocksdb::{DB, Snapshot, SeekKey}; -use rocksdb::rocksdb_options::ReadOptions; -use serde::forward_to_deserialize_any; -use serde::de::value::MapDeserializer; -use serde::de::{self, Visitor, IntoDeserializer}; - -use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::database::schema::Schema; -use meilidb_core::DocumentId; - -pub struct Deserializer<'a, D> -where D: Deref -{ - snapshot: &'a Snapshot, - schema: &'a Schema, - document_id: DocumentId, -} - -impl<'a, D> Deserializer<'a, D> -where D: Deref -{ - pub fn new(snapshot: &'a Snapshot, schema: &'a Schema, doc: DocumentId) -> Self { - Deserializer { snapshot, schema, document_id: doc } - } -} - -impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D> -where D: Deref -{ - type Error = DeserializerError; - - fn deserialize_any(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_map(visitor) - } - - forward_to_deserialize_any! { - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq - bytes byte_buf unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } - - fn deserialize_map(self, visitor: V) -> Result - where V: Visitor<'de> - { - let mut options = ReadOptions::new(); - let lower = DocumentKey::new(self.document_id); - let upper = lower.with_attribute_max(); - options.set_iterate_lower_bound(lower.as_ref()); - options.set_iterate_upper_bound(upper.as_ref()); - - let mut iter = self.snapshot.iter_opt(options); - iter.seek(SeekKey::Start); - - if iter.kv().is_none() { - // FIXME return an error - } - - let iter = iter.map(|(key, value)| { - // retrieve the schema attribute name - // from the schema attribute number - let document_key_attr = DocumentKeyAttr::from_bytes(&key); - let schema_attr = document_key_attr.attribute(); - let attribute_name = self.schema.attribute_name(schema_attr); - (attribute_name, Value(value)) - }); - - let map_deserializer = MapDeserializer::new(iter); - visitor.visit_map(map_deserializer) - } -} - -struct Value(Vec); - -impl<'de> IntoDeserializer<'de, DeserializerError> for Value { - type Deserializer = Self; - - fn into_deserializer(self) -> Self::Deserializer { - self - } -} - -macro_rules! forward_to_bincode_values { - ($($ty:ident => $de_method:ident,)*) => { - $( - fn $de_method(self, visitor: V) -> Result - where V: de::Visitor<'de> - { - match bincode::deserialize::<$ty>(&self.0) { - Ok(val) => val.into_deserializer().$de_method(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - )* - } -} - -impl<'de, 'a> de::Deserializer<'de> for Value { - type Error = DeserializerError; - - fn deserialize_any(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.0.into_deserializer().deserialize_any(visitor) - } - - fn deserialize_str(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_string(visitor) - } - - fn deserialize_string(self, visitor: V) -> Result - where V: Visitor<'de> - { - match bincode::deserialize::(&self.0) { - Ok(val) => val.into_deserializer().deserialize_string(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - - fn deserialize_bytes(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_byte_buf(visitor) - } - - fn deserialize_byte_buf(self, visitor: V) -> Result - where V: Visitor<'de> - { - match bincode::deserialize::>(&self.0) { - Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - - forward_to_bincode_values! { - char => deserialize_char, - bool => deserialize_bool, - - u8 => deserialize_u8, - u16 => deserialize_u16, - u32 => deserialize_u32, - u64 => deserialize_u64, - - i8 => deserialize_i8, - i16 => deserialize_i16, - i32 => deserialize_i32, - i64 => deserialize_i64, - - f32 => deserialize_f32, - f64 => deserialize_f64, - } - - forward_to_deserialize_any! { - unit seq map - unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } -} - -#[derive(Debug)] -pub enum DeserializerError { - Custom(String), -} - -impl de::Error for DeserializerError { - fn custom(msg: T) -> Self { - DeserializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for DeserializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - DeserializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for DeserializerError {} diff --git a/meilidb/src/database/serde/find_id.rs b/meilidb/src/database/serde/find_id.rs deleted file mode 100644 index 3c44b5e35..000000000 --- a/meilidb/src/database/serde/find_id.rs +++ /dev/null @@ -1,243 +0,0 @@ -use serde::Serialize; -use serde::ser; - -use crate::database::serde::key_to_string::KeyToStringSerializer; -use crate::database::serde::{SerializerError, calculate_hash}; -use meilidb_core::DocumentId; - -pub struct FindDocumentIdSerializer<'a> { - pub id_attribute_name: &'a str, -} - -impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = FindDocumentIdMapSerializer<'a>; - type SerializeStruct = FindDocumentIdStructSerializer<'a>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(FindDocumentIdMapSerializer { - id_attribute_name: self.id_attribute_name, - document_id: None, - current_key_name: None, - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(FindDocumentIdStructSerializer { - id_attribute_name: self.id_attribute_name, - document_id: None, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -pub struct FindDocumentIdMapSerializer<'a> { - id_attribute_name: &'a str, - document_id: Option, - current_key_name: Option, -} - -impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - self.current_key_name = Some(key); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V - ) -> Result<(), Self::Error> - where K: Serialize, V: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - - if self.id_attribute_name == key { - // TODO is it possible to have multiple ids? - let id = bincode::serialize(value).unwrap(); - let hash = calculate_hash(&id); - self.document_id = Some(DocumentId(hash)); - } - - Ok(()) - } - - fn end(self) -> Result { - match self.document_id { - Some(document_id) => Ok(document_id), - None => Err(SerializerError::DocumentIdNotFound) - } - } -} - -pub struct FindDocumentIdStructSerializer<'a> { - id_attribute_name: &'a str, - document_id: Option, -} - -impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if self.id_attribute_name == key { - // TODO can it be possible to have multiple ids? - let id = bincode::serialize(value).unwrap(); - let hash = calculate_hash(&id); - self.document_id = Some(DocumentId(hash)); - } - - Ok(()) - } - - fn end(self) -> Result { - match self.document_id { - Some(document_id) => Ok(document_id), - None => Err(SerializerError::DocumentIdNotFound) - } - } -} diff --git a/meilidb/src/database/serde/indexer_serializer.rs b/meilidb/src/database/serde/indexer_serializer.rs deleted file mode 100644 index ae5a0e4cb..000000000 --- a/meilidb/src/database/serde/indexer_serializer.rs +++ /dev/null @@ -1,190 +0,0 @@ -use std::collections::HashSet; - -use serde::Serialize; -use serde::ser; -use meilidb_core::{DocumentId, DocIndex}; -use meilidb_tokenizer::{Tokenizer, Token, is_cjk}; - -use crate::database::update::DocumentUpdate; -use crate::database::serde::SerializerError; -use crate::database::schema::SchemaAttr; - -pub struct IndexerSerializer<'a, 'b> { - pub update: &'a mut DocumentUpdate<'b>, - pub document_id: DocumentId, - pub attribute: SchemaAttr, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, v: &str) -> Result { - for token in Tokenizer::new(v) { - let Token { word, word_index, char_index } = token; - let document_id = self.document_id; - - // FIXME must u32::try_from instead - let attribute = self.attribute.0; - let word_index = word_index as u16; - - // insert the exact representation - let word_lower = word.to_lowercase(); - let length = word.chars().count() as u16; - - if self.stop_words.contains(&word_lower) { continue } - - // and the unidecoded lowercased version - if !word_lower.chars().any(is_cjk) { - let word_unidecoded = unidecode::unidecode(word).to_lowercase(); - let word_unidecoded = word_unidecoded.trim(); - if word_lower != word_unidecoded { - let char_index = char_index as u16; - let char_length = length; - - let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; - self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?; - } - } - - let char_index = char_index as u16; - let char_length = length; - - let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; - self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?; - } - Ok(()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "seq" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/meilidb/src/database/serde/key_to_string.rs b/meilidb/src/database/serde/key_to_string.rs deleted file mode 100644 index 2fe0c5a39..000000000 --- a/meilidb/src/database/serde/key_to_string.rs +++ /dev/null @@ -1,146 +0,0 @@ -use serde::Serialize; -use serde::ser; - -use crate::database::serde::SerializerError; - -pub struct KeyToStringSerializer; - -impl ser::Serializer for KeyToStringSerializer { - type Ok = String; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, value: &str) -> Result { - Ok(value.to_string()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/meilidb/src/database/serde/mod.rs b/meilidb/src/database/serde/mod.rs deleted file mode 100644 index 493124f7e..000000000 --- a/meilidb/src/database/serde/mod.rs +++ /dev/null @@ -1,65 +0,0 @@ -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; -use std::error::Error; -use std::fmt; - -use serde::ser; - -macro_rules! forward_to_unserializable_type { - ($($ty:ident => $se_method:ident,)*) => { - $( - fn $se_method(self, _v: $ty) -> Result { - Err(SerializerError::UnserializableType { name: "$ty" }) - } - )* - } -} - -pub mod find_id; -pub mod key_to_string; -pub mod value_to_number; -pub mod serializer; -pub mod indexer_serializer; -pub mod deserializer; - -pub fn calculate_hash(t: &T) -> u64 { - let mut s = DefaultHasher::new(); - t.hash(&mut s); - s.finish() -} - -#[derive(Debug)] -pub enum SerializerError { - DocumentIdNotFound, - UnserializableType { name: &'static str }, - Custom(String), -} - -impl ser::Error for SerializerError { - fn custom(msg: T) -> Self { - SerializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for SerializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - SerializerError::DocumentIdNotFound => { - write!(f, "serialized document does not have an id according to the schema") - } - SerializerError::UnserializableType { name } => { - write!(f, "Only struct and map types are considered valid documents and - can be serialized, not {} types directly.", name) - }, - SerializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for SerializerError {} - -impl From for SerializerError { - fn from(value: String) -> SerializerError { - SerializerError::Custom(value) - } -} diff --git a/meilidb/src/database/serde/serializer.rs b/meilidb/src/database/serde/serializer.rs deleted file mode 100644 index e1be310ed..000000000 --- a/meilidb/src/database/serde/serializer.rs +++ /dev/null @@ -1,282 +0,0 @@ -use std::collections::HashSet; - -use serde::Serialize; -use serde::ser; - -use crate::database::serde::indexer_serializer::IndexerSerializer; -use crate::database::serde::key_to_string::KeyToStringSerializer; -use crate::database::serde::value_to_number::ValueToNumberSerializer; -use crate::database::update::DocumentUpdate; -use crate::database::serde::SerializerError; -use crate::database::schema::Schema; -use meilidb_core::DocumentId; - -pub struct Serializer<'a, 'b> { - pub schema: &'a Schema, - pub update: &'a mut DocumentUpdate<'b>, - pub document_id: DocumentId, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapSerializer<'a, 'b>; - type SerializeStruct = StructSerializer<'a, 'b>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(MapSerializer { - schema: self.schema, - document_id: self.document_id, - update: self.update, - stop_words: self.stop_words, - current_key_name: None, - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(StructSerializer { - schema: self.schema, - document_id: self.document_id, - update: self.update, - stop_words: self.stop_words, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -pub struct MapSerializer<'a, 'b> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub update: &'a mut DocumentUpdate<'b>, - pub stop_words: &'a HashSet, - pub current_key_name: Option, -} - -impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - self.current_key_name = Some(key); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where K: Serialize, V: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if props.is_stored() { - let value = bincode::serialize(value).unwrap(); - self.update.insert_attribute_value(attr, &value)?; - } - if props.is_indexed() { - let serializer = IndexerSerializer { - update: self.update, - document_id: self.document_id, - attribute: attr, - stop_words: self.stop_words, - }; - value.serialize(serializer)?; - } - if props.is_ranked() { - let number = value.serialize(ValueToNumberSerializer)?; - self.update.register_ranked_attribute(attr, number)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} - -pub struct StructSerializer<'a, 'b> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub update: &'a mut DocumentUpdate<'b>, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if props.is_stored() { - let value = bincode::serialize(value).unwrap(); - self.update.insert_attribute_value(attr, &value)?; - } - if props.is_indexed() { - let serializer = IndexerSerializer { - update: self.update, - document_id: self.document_id, - attribute: attr, - stop_words: self.stop_words, - }; - value.serialize(serializer)?; - } - if props.is_ranked() { - let integer = value.serialize(ValueToNumberSerializer)?; - self.update.register_ranked_attribute(attr, integer)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} diff --git a/meilidb/src/database/serde/value_to_number.rs b/meilidb/src/database/serde/value_to_number.rs deleted file mode 100644 index a70b92fc4..000000000 --- a/meilidb/src/database/serde/value_to_number.rs +++ /dev/null @@ -1,176 +0,0 @@ -use std::str::FromStr; - -use serde::Serialize; -use serde::{ser, ser::Error}; - -use crate::database::serde::SerializerError; -use crate::database::Number; - -pub struct ValueToNumberSerializer; - -impl ser::Serializer for ValueToNumberSerializer { - type Ok = Number; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - } - - fn serialize_i8(self, value: i8) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_i16(self, value: i16) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_i32(self, value: i32) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_i64(self, value: i64) -> Result { - Ok(Number::Signed(value as i64)) - } - - fn serialize_u8(self, value: u8) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_u16(self, value: u16) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_u32(self, value: u32) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_u64(self, value: u64) -> Result { - Ok(Number::Unsigned(value as u64)) - } - - fn serialize_f32(self, value: f32) -> Result { - Ok(Number::Float(value as f64)) - } - - fn serialize_f64(self, value: f64) -> Result { - Ok(Number::Float(value)) - } - - fn serialize_str(self, value: &str) -> Result { - Number::from_str(value).map_err(SerializerError::custom) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/meilidb/src/database/update/index_event.rs b/meilidb/src/database/update/index_event.rs deleted file mode 100644 index 20dbcbf46..000000000 --- a/meilidb/src/database/update/index_event.rs +++ /dev/null @@ -1,55 +0,0 @@ -use std::error::Error; - -use byteorder::{ReadBytesExt, WriteBytesExt}; -use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::data::DocIds; - -use crate::database::Index; - -pub enum WriteIndexEvent<'a> { - RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a Index), -} - -impl<'a> WriteToBytes for WriteIndexEvent<'a> { - fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - WriteIndexEvent::RemovedDocuments(doc_ids) => { - let _ = bytes.write_u8(0); - doc_ids.write_to_bytes(bytes); - }, - WriteIndexEvent::UpdatedDocuments(index) => { - let _ = bytes.write_u8(1); - index.write_to_bytes(bytes); - } - } - } -} - -pub enum ReadIndexEvent { - RemovedDocuments(DocIds), - UpdatedDocuments(Index), -} - -impl ReadIndexEvent { - pub fn updated_documents(self) -> Option { - use ReadIndexEvent::*; - match self { - RemovedDocuments(_) => None, - UpdatedDocuments(index) => Some(index), - } - } -} - -impl FromSharedDataCursor for ReadIndexEvent { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { - match cursor.read_u8()? { - 0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments), - 1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments), - _ => unreachable!(), - } - } -} diff --git a/meilidb/src/database/update/mod.rs b/meilidb/src/database/update/mod.rs deleted file mode 100644 index f34cf6a8e..000000000 --- a/meilidb/src/database/update/mod.rs +++ /dev/null @@ -1,234 +0,0 @@ -use std::collections::{HashSet, BTreeMap}; -use std::error::Error; - -use rocksdb::rocksdb::{Writable, WriteBatch}; -use hashbrown::hash_map::HashMap; -use sdset::{Set, SetBuf}; -use serde::Serialize; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::data::DocIds; -use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; - -use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::database::serde::serializer::Serializer; -use crate::database::serde::SerializerError; -use crate::database::schema::SchemaAttr; -use crate::database::schema::Schema; -use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; -use crate::database::{RankedMap, Number}; - -pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; -pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; - -mod index_event; -mod ranked_map_event; - -pub type Token = Vec; // TODO could be replaced by a SmallVec - -pub struct Update { - schema: Schema, - raw_builder: RawUpdateBuilder, -} - -impl Update { - pub(crate) fn new(schema: Schema) -> Update { - Update { schema, raw_builder: RawUpdateBuilder::new() } - } - - pub fn update_document( - &mut self, - document: T, - stop_words: &HashSet, - ) -> Result - where T: Serialize, - { - let document_id = self.schema.document_id(&document)?; - - let serializer = Serializer { - schema: &self.schema, - document_id: document_id, - update: &mut self.raw_builder.document_update(document_id)?, - stop_words: stop_words, - }; - - document.serialize(serializer)?; - - Ok(document_id) - } - - pub fn remove_document(&mut self, document: T) -> Result - where T: Serialize, - { - let document_id = self.schema.document_id(&document)?; - self.raw_builder.document_update(document_id)?.remove()?; - Ok(document_id) - } - - pub(crate) fn build(self) -> Result> { - self.raw_builder.build() - } -} - -#[derive(Copy, Clone, PartialEq, Eq)] -enum UpdateType { - Updated, - Deleted, -} - -use UpdateType::{Updated, Deleted}; - -pub struct RawUpdateBuilder { - documents_update: HashMap, - documents_ranked_fields: RankedMap, - indexed_words: BTreeMap>, - batch: WriteBatch, -} - -impl RawUpdateBuilder { - pub fn new() -> RawUpdateBuilder { - RawUpdateBuilder { - documents_update: HashMap::new(), - documents_ranked_fields: HashMap::new(), - indexed_words: BTreeMap::new(), - batch: WriteBatch::new(), - } - } - - pub fn document_update(&mut self, document_id: DocumentId) -> Result { - use serde::ser::Error; - - match self.documents_update.get(&document_id) { - Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }), - Some(Updated) => Err(SerializerError::custom( - "This document has already been removed and cannot be updated in the same update" - )), - } - } - - pub fn build(self) -> Result> { - // create the list of all the removed documents - let removed_documents = { - let mut document_ids = Vec::new(); - for (id, update_type) in self.documents_update { - if update_type == Deleted { - document_ids.push(id); - } - } - - document_ids.sort_unstable(); - let setbuf = SetBuf::new_unchecked(document_ids); - DocIds::new(&setbuf) - }; - - // create the Index of all the document updates - let index = { - let mut builder = IndexBuilder::new(); - for (key, mut indexes) in self.indexed_words { - indexes.sort_unstable(); - let indexes = Set::new_unchecked(&indexes); - builder.insert(key, indexes).unwrap(); - } - builder.build() - }; - - // WARN: removed documents must absolutely - // be merged *before* document updates - - // === index === - - if !removed_documents.is_empty() { - // remove the documents using the appropriate IndexEvent - let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes(); - self.batch.merge(DATA_INDEX, &event_bytes)?; - } - - // update the documents using the appropriate IndexEvent - let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes(); - self.batch.merge(DATA_INDEX, &event_bytes)?; - - // === ranked map === - - if !removed_documents.is_empty() { - // update the ranked map using the appropriate RankedMapEvent - let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes(); - self.batch.merge(DATA_RANKED_MAP, &event_bytes)?; - } - - // update the documents using the appropriate IndexEvent - let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes(); - self.batch.merge(DATA_RANKED_MAP, &event_bytes)?; - - Ok(self.batch) - } -} - -pub struct DocumentUpdate<'a> { - document_id: DocumentId, - inner: &'a mut RawUpdateBuilder, -} - -impl<'a> DocumentUpdate<'a> { - pub fn remove(&mut self) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) { - return Err(SerializerError::custom( - "This document has already been updated and cannot be removed in the same update" - )); - } - - let start = DocumentKey::new(self.document_id).with_attribute_min(); - let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1 - self.inner.batch.delete_range(start.as_ref(), end.as_ref())?; - - Ok(()) - } - - pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted and cannot be updated in the same update" - )); - } - - let key = DocumentKeyAttr::new(self.document_id, attr); - self.inner.batch.put(key.as_ref(), &value)?; - - Ok(()) - } - - pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted and cannot be updated in the same update" - )); - } - - self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index); - - Ok(()) - } - - pub fn register_ranked_attribute( - &mut self, - attr: SchemaAttr, - number: Number, - ) -> Result<(), SerializerError> - { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted, ranked attributes cannot be added in the same update" - )); - } - - self.inner.documents_ranked_fields.insert((self.document_id, attr), number); - - Ok(()) - } -} diff --git a/meilidb/src/database/update/ranked_map_event.rs b/meilidb/src/database/update/ranked_map_event.rs deleted file mode 100644 index 428bc62cf..000000000 --- a/meilidb/src/database/update/ranked_map_event.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::error::Error; - -use byteorder::{ReadBytesExt, WriteBytesExt}; -use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use meilidb_core::write_to_bytes::WriteToBytes; -use meilidb_core::data::DocIds; - -use crate::database::RankedMap; - -pub enum WriteRankedMapEvent<'a> { - RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a RankedMap), -} - -impl<'a> WriteToBytes for WriteRankedMapEvent<'a> { - fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - WriteRankedMapEvent::RemovedDocuments(doc_ids) => { - let _ = bytes.write_u8(0); - doc_ids.write_to_bytes(bytes); - }, - WriteRankedMapEvent::UpdatedDocuments(ranked_map) => { - let _ = bytes.write_u8(1); - bincode::serialize_into(bytes, ranked_map).unwrap() - } - } - } -} - -pub enum ReadRankedMapEvent { - RemovedDocuments(DocIds), - UpdatedDocuments(RankedMap), -} - -impl ReadRankedMapEvent { - pub fn updated_documents(self) -> Option { - use ReadRankedMapEvent::*; - match self { - RemovedDocuments(_) => None, - UpdatedDocuments(ranked_map) => Some(ranked_map), - } - } -} - -impl FromSharedDataCursor for ReadRankedMapEvent { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { - match cursor.read_u8()? { - 0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments), - 1 => { - let ranked_map = bincode::deserialize_from(cursor)?; - Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map)) - }, - _ => unreachable!(), - } - } -} diff --git a/meilidb/src/database/view.rs b/meilidb/src/database/view.rs deleted file mode 100644 index fcbb3fea1..000000000 --- a/meilidb/src/database/view.rs +++ /dev/null @@ -1,199 +0,0 @@ -use std::error::Error; -use std::path::Path; -use std::ops::Deref; -use std::{fmt, marker}; - -use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; -use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; -use serde::de::DeserializeOwned; -use meilidb_core::{Index, QueryBuilder, DocumentId}; - -use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; -use crate::database::serde::deserializer::Deserializer; -use crate::database::{DocumentKey, DocumentKeyAttr}; -use crate::database::schema::Schema; -use crate::database::RankedMap; -use crate::database::Config; - -pub struct DatabaseView -where D: Deref -{ - snapshot: Snapshot, - index: Index, - ranked_map: RankedMap, - schema: Schema, - config: Config, -} - -impl DatabaseView -where D: Deref -{ - pub fn new(snapshot: Snapshot) -> Result, Box> { - let schema = retrieve_data_schema(&snapshot)?; - let index = retrieve_data_index(&snapshot)?; - let ranked_map = retrieve_data_ranked_map(&snapshot)?; - let config = retrieve_config(&snapshot)?; - Ok(DatabaseView { snapshot, index, ranked_map, schema, config }) - } - - pub fn schema(&self) -> &Schema { - &self.schema - } - - pub fn index(&self) -> &Index { - &self.index - } - - pub fn ranked_map(&self) -> &RankedMap { - &self.ranked_map - } - - pub fn into_snapshot(self) -> Snapshot { - self.snapshot - } - - pub fn snapshot(&self) -> &Snapshot { - &self.snapshot - } - - pub fn config(&self) -> &Config { - &self.config - } - - pub fn get(&self, key: &[u8]) -> Result, Box> { - Ok(self.snapshot.get(key)?) - } - - pub fn dump_all>(&self, path: P) -> Result<(), Box> { - let path = path.as_ref().to_string_lossy(); - - let env_options = EnvOptions::new(); - let column_family_options = ColumnFamilyOptions::new(); - let mut file_writer = SstFileWriter::new(env_options, column_family_options); - file_writer.open(&path)?; - - let mut iter = self.snapshot.iter(); - iter.seek(SeekKey::Start); - - for (key, value) in &mut iter { - file_writer.put(&key, &value)?; - } - - file_writer.finish()?; - Ok(()) - } - - pub fn query_builder(&self) -> QueryBuilder { - QueryBuilder::new(self.index()) - } - - pub fn raw_field_by_document_id( - &self, - name: &str, - id: DocumentId - ) -> Result>, Box> - { - let attr = self.schema.attribute(name).ok_or("field not found")?; - let key = DocumentKeyAttr::new(id, attr); - let vector = self.snapshot.get(key.as_ref())?; - - Ok(vector.map(|v| v.to_vec())) - } - - pub fn document_by_id(&self, id: DocumentId) -> Result> - where T: DeserializeOwned, - { - let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id); - Ok(T::deserialize(&mut deserializer)?) - } - - pub fn documents_by_id(&self, ids: I) -> DocumentIter - where T: DeserializeOwned, - I: IntoIterator, - { - DocumentIter { - database_view: self, - document_ids: ids.into_iter(), - _phantom: marker::PhantomData, - } - } -} - -impl fmt::Debug for DatabaseView -where D: Deref -{ - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mut options = ReadOptions::new(); - let lower = DocumentKey::new(DocumentId(0)); - options.set_iterate_lower_bound(lower.as_ref()); - - let mut iter = self.snapshot.iter_opt(options); - iter.seek(SeekKey::Start); - let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key)); - - if f.alternate() { - writeln!(f, "DatabaseView(")?; - } else { - write!(f, "DatabaseView(")?; - } - - self.schema.fmt(f)?; - - if f.alternate() { - writeln!(f, ",")?; - } else { - write!(f, ", ")?; - } - - f.debug_list().entries(iter).finish()?; - - write!(f, ")") - } -} - -// TODO this is just an iter::Map !!! -pub struct DocumentIter<'a, D, T, I> -where D: Deref -{ - database_view: &'a DatabaseView, - document_ids: I, - _phantom: marker::PhantomData, -} - -impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: Iterator, -{ - type Item = Result>; - - fn size_hint(&self) -> (usize, Option) { - self.document_ids.size_hint() - } - - fn next(&mut self) -> Option { - match self.document_ids.next() { - Some(id) => Some(self.database_view.document_by_id(id)), - None => None - } - } -} - -impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: ExactSizeIterator + Iterator, -{ } - -impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: DoubleEndedIterator + Iterator, -{ - fn next_back(&mut self) -> Option { - match self.document_ids.next_back() { - Some(id) => Some(self.database_view.document_by_id(id)), - None => None - } - } -} diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs index 325df65eb..aba7ab6a7 100644 --- a/meilidb/src/lib.rs +++ b/meilidb/src/lib.rs @@ -1,10 +1,7 @@ #![cfg_attr(feature = "nightly", feature(test))] -pub mod database; mod common_words; mod sort_by_attr; -pub use rocksdb; - pub use self::sort_by_attr::SortByAttr; pub use self::common_words::CommonWords; From 287d5dee4dab1140ee77306b0a02006055096e79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Mar 2019 17:01:10 +0100 Subject: [PATCH 17/44] feat: Introduce the meilidb-data workspace member --- Cargo.toml | 1 + meilidb-data/Cargo.toml | 8 ++++++++ meilidb-data/src/database.rs | 21 +++++++++++++++++++++ meilidb-data/src/lib.rs | 3 +++ 4 files changed, 33 insertions(+) create mode 100644 meilidb-data/Cargo.toml create mode 100644 meilidb-data/src/database.rs create mode 100644 meilidb-data/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 1d97a68c7..69297052b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "meilidb", "meilidb-core", + "meilidb-data", "meilidb-tokenizer", ] diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml new file mode 100644 index 000000000..82d1bd8d4 --- /dev/null +++ b/meilidb-data/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "meilidb-data" +version = "0.1.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +sled = "0.20.0" diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs new file mode 100644 index 000000000..7f52dccda --- /dev/null +++ b/meilidb-data/src/database.rs @@ -0,0 +1,21 @@ +use std::path::Path; +use std::sync::Arc; + +#[derive(Clone)] +pub struct Database(sled::Db); + +impl Database { + pub fn start_default>(path: P) -> sled::Result { + sled::Db::start_default(path).map(Database) + } + + pub fn open_index(&self, name: &str) -> sled::Result { + let name = format!("index-{}", name); + let bytes = name.into_bytes(); + + self.0.open_tree(bytes).map(Index) + } +} + +#[derive(Debug, Clone)] +pub struct Index(Arc); diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs new file mode 100644 index 000000000..4308ec6ad --- /dev/null +++ b/meilidb-data/src/lib.rs @@ -0,0 +1,3 @@ +mod database; + +pub use self::database::{Database, Index}; From 95dfbd1fe093ca180802fa4e0ffbab8664a0e201 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Mar 2019 17:28:54 +0100 Subject: [PATCH 18/44] feat: Introduce the meilidb-data schema module --- meilidb-core/Cargo.toml | 3 +- meilidb-core/src/lib.rs | 2 +- meilidb-data/Cargo.toml | 6 + meilidb-data/src/lib.rs | 2 + meilidb-data/src/schema.rs | 309 ++++++++++++++++++++++++++++++++++++ meilidb/Cargo.toml | 1 + meilidb/src/sort_by_attr.rs | 4 +- 7 files changed, 322 insertions(+), 5 deletions(-) create mode 100644 meilidb-data/src/schema.rs diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index fbac7dbe2..233243016 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -14,8 +14,7 @@ log = "0.4.6" meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } rayon = "1.0.3" sdset = "0.3.1" -serde = "1.0.88" -serde_derive = "1.0.88" +serde = { version = "1.0.88", features = ["derive"] } slice-group-by = "0.2.4" [features] diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 11c734e37..18e9a99cc 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -9,7 +9,7 @@ pub mod shared_data_cursor; pub mod write_to_bytes; use std::sync::Arc; -use serde_derive::{Serialize, Deserialize}; +use serde::{Serialize, Deserialize}; use slice_group_by::GroupBy; use rayon::slice::ParallelSliceMut; diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index 82d1bd8d4..f0046bc1a 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -5,4 +5,10 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] +bincode = "1.1.2" +linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } +meilidb-core = { path = "../meilidb-core", version = "0.1.0" } +serde = { version = "1.0.88", features = ["derive"] } +serde_json = { version = "1.0.39", features = ["preserve_order"] } sled = "0.20.0" +toml = { version = "0.5.0", features = ["preserve_order"] } diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index 4308ec6ad..a2f028ecd 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -1,3 +1,5 @@ mod database; +mod schema; pub use self::database::{Database, Index}; +pub use self::schema::{Schema, SchemaAttr, SchemaBuilder}; diff --git a/meilidb-data/src/schema.rs b/meilidb-data/src/schema.rs new file mode 100644 index 000000000..c73b8b067 --- /dev/null +++ b/meilidb-data/src/schema.rs @@ -0,0 +1,309 @@ +use std::collections::{HashMap, BTreeMap}; +use std::io::{Read, Write}; +use std::error::Error; +use std::{fmt, u16}; +use std::ops::BitOr; +use std::sync::Arc; + +use serde::{Serialize, Deserialize}; +use linked_hash_map::LinkedHashMap; + +use meilidb_core::DocumentId; + +pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false }; +pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false }; +pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true }; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SchemaProps { + #[serde(default)] + stored: bool, + + #[serde(default)] + indexed: bool, + + #[serde(default)] + ranked: bool, +} + +impl SchemaProps { + pub fn is_stored(self) -> bool { + self.stored + } + + pub fn is_indexed(self) -> bool { + self.indexed + } + + pub fn is_ranked(self) -> bool { + self.ranked + } +} + +impl BitOr for SchemaProps { + type Output = Self; + + fn bitor(self, other: Self) -> Self::Output { + SchemaProps { + stored: self.stored | other.stored, + indexed: self.indexed | other.indexed, + ranked: self.ranked | other.ranked, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct SchemaBuilder { + identifier: String, + attributes: LinkedHashMap, +} + +impl SchemaBuilder { + pub fn with_identifier>(name: S) -> SchemaBuilder { + SchemaBuilder { + identifier: name.into(), + attributes: LinkedHashMap::new(), + } + } + + pub fn new_attribute>(&mut self, name: S, props: SchemaProps) -> SchemaAttr { + let len = self.attributes.len(); + if self.attributes.insert(name.into(), props).is_some() { + panic!("Field already inserted.") + } + SchemaAttr(len as u16) + } + + pub fn build(self) -> Schema { + let mut attrs = HashMap::new(); + let mut props = Vec::new(); + + for (i, (name, prop)) in self.attributes.into_iter().enumerate() { + attrs.insert(name.clone(), SchemaAttr(i as u16)); + props.push((name, prop)); + } + + let identifier = self.identifier; + Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Schema { + inner: Arc, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct InnerSchema { + identifier: String, + attrs: HashMap, + props: Vec<(String, SchemaProps)>, +} + +impl Schema { + pub fn from_toml(mut reader: R) -> Result> { + let mut buffer = Vec::new(); + reader.read_to_end(&mut buffer)?; + let builder: SchemaBuilder = toml::from_slice(&buffer)?; + Ok(builder.build()) + } + + pub fn to_toml(&self, mut writer: W) -> Result<(), Box> { + let identifier = self.inner.identifier.clone(); + let attributes = self.attributes_ordered(); + let builder = SchemaBuilder { identifier, attributes }; + + let string = toml::to_string_pretty(&builder)?; + writer.write_all(string.as_bytes())?; + + Ok(()) + } + + pub fn from_json(mut reader: R) -> Result> { + let mut buffer = Vec::new(); + reader.read_to_end(&mut buffer)?; + let builder: SchemaBuilder = serde_json::from_slice(&buffer)?; + Ok(builder.build()) + } + + pub fn to_json(&self, mut writer: W) -> Result<(), Box> { + let identifier = self.inner.identifier.clone(); + let attributes = self.attributes_ordered(); + let builder = SchemaBuilder { identifier, attributes }; + let string = serde_json::to_string_pretty(&builder)?; + writer.write_all(string.as_bytes())?; + + Ok(()) + } + + pub(crate) fn read_from_bin(reader: R) -> bincode::Result { + let builder: SchemaBuilder = bincode::deserialize_from(reader)?; + Ok(builder.build()) + } + + pub(crate) fn write_to_bin(&self, writer: W) -> bincode::Result<()> { + let identifier = self.inner.identifier.clone(); + let attributes = self.attributes_ordered(); + let builder = SchemaBuilder { identifier, attributes }; + + bincode::serialize_into(writer, &builder) + } + + fn attributes_ordered(&self) -> LinkedHashMap { + let mut ordered = BTreeMap::new(); + for (name, attr) in &self.inner.attrs { + let (_, props) = self.inner.props[attr.0 as usize]; + ordered.insert(attr.0, (name, props)); + } + + let mut attributes = LinkedHashMap::with_capacity(ordered.len()); + for (_, (name, props)) in ordered { + attributes.insert(name.clone(), props); + } + + attributes + } + + pub fn props(&self, attr: SchemaAttr) -> SchemaProps { + let (_, props) = self.inner.props[attr.0 as usize]; + props + } + + pub fn identifier_name(&self) -> &str { + &self.inner.identifier + } + + pub fn attribute>(&self, name: S) -> Option { + self.inner.attrs.get(name.as_ref()).cloned() + } + + pub fn attribute_name(&self, attr: SchemaAttr) -> &str { + let (name, _) = &self.inner.props[attr.0 as usize]; + name + } +} + +#[derive(Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct SchemaAttr(pub u16); + +impl SchemaAttr { + pub fn new(value: u16) -> SchemaAttr { + SchemaAttr(value) + } + + pub fn min() -> SchemaAttr { + SchemaAttr(0) + } + + pub fn next(self) -> Option { + self.0.checked_add(1).map(SchemaAttr) + } + + pub fn prev(self) -> Option { + self.0.checked_sub(1).map(SchemaAttr) + } + + pub fn max() -> SchemaAttr { + SchemaAttr(u16::MAX) + } +} + +impl fmt::Display for SchemaAttr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::error::Error; + + #[test] + fn serialize_deserialize() -> bincode::Result<()> { + let mut builder = SchemaBuilder::with_identifier("id"); + builder.new_attribute("alpha", STORED); + builder.new_attribute("beta", STORED | INDEXED); + builder.new_attribute("gamma", INDEXED); + let schema = builder.build(); + + let mut buffer = Vec::new(); + + schema.write_to_bin(&mut buffer)?; + let schema2 = Schema::read_from_bin(buffer.as_slice())?; + + assert_eq!(schema, schema2); + + Ok(()) + } + + #[test] + fn serialize_deserialize_toml() -> Result<(), Box> { + let mut builder = SchemaBuilder::with_identifier("id"); + builder.new_attribute("alpha", STORED); + builder.new_attribute("beta", STORED | INDEXED); + builder.new_attribute("gamma", INDEXED); + let schema = builder.build(); + + let mut buffer = Vec::new(); + schema.to_toml(&mut buffer)?; + + let schema2 = Schema::from_toml(buffer.as_slice())?; + assert_eq!(schema, schema2); + + let data = r#" + identifier = "id" + + [attributes."alpha"] + stored = true + + [attributes."beta"] + stored = true + indexed = true + + [attributes."gamma"] + indexed = true + "#; + let schema2 = Schema::from_toml(data.as_bytes())?; + assert_eq!(schema, schema2); + + Ok(()) + } + + #[test] + fn serialize_deserialize_json() -> Result<(), Box> { + let mut builder = SchemaBuilder::with_identifier("id"); + builder.new_attribute("alpha", STORED); + builder.new_attribute("beta", STORED | INDEXED); + builder.new_attribute("gamma", INDEXED); + let schema = builder.build(); + + let mut buffer = Vec::new(); + schema.to_json(&mut buffer)?; + + let schema2 = Schema::from_json(buffer.as_slice())?; + assert_eq!(schema, schema2); + + let data = r#" + { + "identifier": "id", + "attributes": { + "alpha": { + "stored": true + }, + "beta": { + "stored": true, + "indexed": true + }, + "gamma": { + "indexed": true + } + } + }"#; + let schema2 = Schema::from_json(data.as_bytes())?; + assert_eq!(schema, schema2); + + Ok(()) + } +} diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index 8dc6f0db5..e8cdb8d56 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -6,6 +6,7 @@ authors = ["Kerollmops "] [dependencies] meilidb-core = { path = "../meilidb-core", version = "0.1.0" } +meilidb-data = { path = "../meilidb-data", version = "0.1.0" } meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } [features] diff --git a/meilidb/src/sort_by_attr.rs b/meilidb/src/sort_by_attr.rs index 24364aaf4..b7a1013fd 100644 --- a/meilidb/src/sort_by_attr.rs +++ b/meilidb/src/sort_by_attr.rs @@ -5,8 +5,8 @@ use std::fmt; use meilidb_core::criterion::Criterion; use meilidb_core::RawDocument; -use crate::database::schema::{Schema, SchemaAttr}; -use crate::database::RankedMap; +use meilidb_data::{Schema, SchemaAttr}; +use meilidb_data::RankedMap; /// An helper struct that permit to sort documents by /// some of their stored attributes. From e41c551757f67c4989652287a65b7269b776a9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Mar 2019 17:38:29 +0100 Subject: [PATCH 19/44] feat: Introduce the Number type --- meilidb-data/Cargo.toml | 1 + meilidb-data/src/lib.rs | 8 ++++-- meilidb-data/src/number.rs | 55 ++++++++++++++++++++++++++++++++++++++ meilidb-data/src/schema.rs | 2 -- 4 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 meilidb-data/src/number.rs diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index f0046bc1a..18881367b 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" bincode = "1.1.2" linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } meilidb-core = { path = "../meilidb-core", version = "0.1.0" } +ordered-float = { version = "1.0.2", features = ["serde"] } serde = { version = "1.0.88", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } sled = "0.20.0" diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index a2f028ecd..96d6bdf6e 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -1,5 +1,9 @@ mod database; -mod schema; +pub mod schema; +mod ranked_map; +mod number; pub use self::database::{Database, Index}; -pub use self::schema::{Schema, SchemaAttr, SchemaBuilder}; +pub use self::schema::{Schema, SchemaAttr}; +pub use self::ranked_map::RankedMap; +pub use self::number::Number; diff --git a/meilidb-data/src/number.rs b/meilidb-data/src/number.rs new file mode 100644 index 000000000..9a2d0ea24 --- /dev/null +++ b/meilidb-data/src/number.rs @@ -0,0 +1,55 @@ +use std::num::{ParseIntError, ParseFloatError}; +use std::str::FromStr; +use std::fmt; + +use ordered_float::OrderedFloat; +use serde::{Serialize, Deserialize}; + +#[derive(Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Number { + Unsigned(u64), + Signed(i64), + Float(OrderedFloat), +} + +impl FromStr for Number { + type Err = ParseNumberError; + + fn from_str(s: &str) -> Result { + let uint_error = match u64::from_str(s) { + Ok(unsigned) => return Ok(Number::Unsigned(unsigned)), + Err(error) => error, + }; + + let int_error = match i64::from_str(s) { + Ok(signed) => return Ok(Number::Signed(signed)), + Err(error) => error, + }; + + let float_error = match f64::from_str(s) { + Ok(float) => return Ok(Number::Float(OrderedFloat(float))), + Err(error) => error, + }; + + Err(ParseNumberError { uint_error, int_error, float_error }) + } +} + +#[derive(Clone, PartialEq, Eq)] +pub struct ParseNumberError { + uint_error: ParseIntError, + int_error: ParseIntError, + float_error: ParseFloatError, +} + +impl fmt::Display for ParseNumberError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.uint_error == self.int_error { + write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error) + } else { + write!(f, "can not parse number: {}, {}, {}", + self.uint_error, self.int_error, self.float_error) + } + } +} diff --git a/meilidb-data/src/schema.rs b/meilidb-data/src/schema.rs index c73b8b067..bff7806dc 100644 --- a/meilidb-data/src/schema.rs +++ b/meilidb-data/src/schema.rs @@ -8,8 +8,6 @@ use std::sync::Arc; use serde::{Serialize, Deserialize}; use linked_hash_map::LinkedHashMap; -use meilidb_core::DocumentId; - pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false }; pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false }; pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true }; From f17a05c3425ef7a81c651d63938e9589e6f08af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Mar 2019 17:38:57 +0100 Subject: [PATCH 20/44] feat: Introduce the RankedMap type --- meilidb-data/Cargo.toml | 1 + meilidb-data/src/ranked_map.rs | 5 +++++ meilidb/src/sort_by_attr.rs | 4 +--- 3 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 meilidb-data/src/ranked_map.rs diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index 18881367b..23749631b 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" [dependencies] bincode = "1.1.2" +hashbrown = { version = "0.1.8", features = ["serde"] } linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } meilidb-core = { path = "../meilidb-core", version = "0.1.0" } ordered-float = { version = "1.0.2", features = ["serde"] } diff --git a/meilidb-data/src/ranked_map.rs b/meilidb-data/src/ranked_map.rs new file mode 100644 index 000000000..7b4ff3735 --- /dev/null +++ b/meilidb-data/src/ranked_map.rs @@ -0,0 +1,5 @@ +use hashbrown::HashMap; +use meilidb_core::DocumentId; +use crate::{SchemaAttr, Number}; + +pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; diff --git a/meilidb/src/sort_by_attr.rs b/meilidb/src/sort_by_attr.rs index b7a1013fd..f4c4bcc41 100644 --- a/meilidb/src/sort_by_attr.rs +++ b/meilidb/src/sort_by_attr.rs @@ -4,9 +4,7 @@ use std::fmt; use meilidb_core::criterion::Criterion; use meilidb_core::RawDocument; - -use meilidb_data::{Schema, SchemaAttr}; -use meilidb_data::RankedMap; +use meilidb_data::{Schema, SchemaAttr, RankedMap}; /// An helper struct that permit to sort documents by /// some of their stored attributes. From 9483f2df60d6e08b33610f6e0591b787ef9f4805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 8 Apr 2019 15:19:57 +0200 Subject: [PATCH 21/44] feat: Introduce a custom Error type --- meilidb-data/Cargo.toml | 2 +- meilidb-data/src/database.rs | 85 ++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index 23749631b..a142c0bd5 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -12,5 +12,5 @@ meilidb-core = { path = "../meilidb-core", version = "0.1.0" } ordered-float = { version = "1.0.2", features = ["serde"] } serde = { version = "1.0.88", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } -sled = "0.20.0" +sled = "0.21.3" toml = { version = "0.5.0", features = ["preserve_order"] } diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 7f52dccda..f3a4d1dbd 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,21 +1,90 @@ -use std::path::Path; use std::sync::Arc; +use std::path::Path; + +use crate::schema::Schema; + +#[derive(Debug)] +pub enum Error { + SchemaNotFound, + SledError(sled::Error), + BincodeError(bincode::Error), +} + +impl From for Error { + fn from(error: sled::Error) -> Error { + Error::SledError(error) + } +} + +impl From for Error { + fn from(error: bincode::Error) -> Error { + Error::BincodeError(error) + } +} + +fn index_name(name: &str) -> Vec { + format!("index-{}", name).into_bytes() +} #[derive(Clone)] pub struct Database(sled::Db); impl Database { - pub fn start_default>(path: P) -> sled::Result { - sled::Db::start_default(path).map(Database) + pub fn start_default>(path: P) -> Result { + sled::Db::start_default(path).map(Database).map_err(Into::into) } - pub fn open_index(&self, name: &str) -> sled::Result { - let name = format!("index-{}", name); - let bytes = name.into_bytes(); + pub fn open_index(&self, name: &str) -> Result, Error> { + let name = index_name(name); - self.0.open_tree(bytes).map(Index) + if self.0.tree_names().into_iter().any(|tn| tn == name) { + let tree = self.0.open_tree(name)?; + let index = Index::from_raw(tree)?; + return Ok(Some(index)) + } + + Ok(None) + } + + pub fn create_index(&self, name: &str, schema: Schema) -> Result { + match self.open_index(name)? { + Some(index) => { + // TODO check if the schema is the same + Ok(index) + }, + None => { + let name = index_name(name); + let tree = self.0.open_tree(name)?; + let index = Index::new_from_raw(tree, schema)?; + Ok(index) + }, + } } } #[derive(Debug, Clone)] -pub struct Index(Arc); +pub struct Index { + schema: Schema, + inner: Arc, +} + +impl Index { + fn from_raw(inner: Arc) -> Result { + let bytes = inner.get("schema")?; + let bytes = bytes.ok_or(Error::SchemaNotFound)?; + + let schema = Schema::read_from_bin(bytes.as_ref())?; + Ok(Index { schema, inner }) + } + + fn new_from_raw(inner: Arc, schema: Schema) -> Result { + let mut schema_bytes = Vec::new(); + schema.write_to_bin(&mut schema_bytes); + inner.set("schema", schema_bytes)?; + Ok(Index { schema, inner }) + } + + pub fn schema(&self) -> &Schema { + &self.schema + } +} From 9be7c0246178689cbfc0cdd5034210ddf87b80d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 8 Apr 2019 16:06:36 +0200 Subject: [PATCH 22/44] chore: Update sled to 0.22.1 --- meilidb-data/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index a142c0bd5..d98b9c491 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -12,5 +12,5 @@ meilidb-core = { path = "../meilidb-core", version = "0.1.0" } ordered-float = { version = "1.0.2", features = ["serde"] } serde = { version = "1.0.88", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } -sled = "0.21.3" +sled = "0.22.1" toml = { version = "0.5.0", features = ["preserve_order"] } From f7eced03fd173ac0f61ff893a6ab38f87fac9d6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 8 Apr 2019 16:16:31 +0200 Subject: [PATCH 23/44] chore: Using a fork of the fst library that support Arc<[u8]> --- meilidb-core/Cargo.toml | 11 +++++++++-- meilidb-core/src/data/shared_data.rs | 16 +++++++++++++--- meilidb-core/src/shared_data_cursor.rs | 4 ++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 233243016..f0e6dc6e0 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -6,10 +6,8 @@ edition = "2018" [dependencies] byteorder = "1.3.1" -fst = "0.3.3" hashbrown = "0.1.8" lazy_static = "1.2.0" -levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } log = "0.4.6" meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } rayon = "1.0.3" @@ -17,6 +15,15 @@ sdset = "0.3.1" serde = { version = "1.0.88", features = ["derive"] } slice-group-by = "0.2.4" +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "arc-byte-slice" + +[dependencies.levenshtein_automata] +git = "https://github.com/Kerollmops/levenshtein-automata.git" +branch = "arc-byte-slice" +features = ["fst_automaton"] + [features] i128 = ["byteorder/i128"] nightly = ["hashbrown/nightly", "slice-group-by/nightly"] diff --git a/meilidb-core/src/data/shared_data.rs b/meilidb-core/src/data/shared_data.rs index 100f837f7..fd505c6d9 100644 --- a/meilidb-core/src/data/shared_data.rs +++ b/meilidb-core/src/data/shared_data.rs @@ -1,9 +1,9 @@ use std::sync::Arc; use std::ops::Deref; -#[derive(Default, Clone)] +#[derive(Clone)] pub struct SharedData { - pub bytes: Arc>, + pub bytes: Arc<[u8]>, pub offset: usize, pub len: usize, } @@ -15,7 +15,7 @@ impl SharedData { SharedData::new(bytes, 0, len) } - pub fn new(bytes: Arc>, offset: usize, len: usize) -> SharedData { + pub fn new(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedData { SharedData { bytes, offset, len } } @@ -33,6 +33,16 @@ impl SharedData { } } +impl Default for SharedData { + fn default() -> SharedData { + SharedData { + bytes: Arc::from(Vec::new()), + offset: 0, + len: 0, + } + } +} + impl Deref for SharedData { type Target = [u8]; diff --git a/meilidb-core/src/shared_data_cursor.rs b/meilidb-core/src/shared_data_cursor.rs index 00d36884a..9eeac472f 100644 --- a/meilidb-core/src/shared_data_cursor.rs +++ b/meilidb-core/src/shared_data_cursor.rs @@ -7,12 +7,12 @@ pub struct SharedDataCursor(Cursor); impl SharedDataCursor { pub fn from_bytes(bytes: Vec) -> SharedDataCursor { let len = bytes.len(); - let bytes = Arc::new(bytes); + let bytes = Arc::from(bytes); SharedDataCursor::from_shared_bytes(bytes, 0, len) } - pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> SharedDataCursor { + pub fn from_shared_bytes(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedDataCursor { let data = SharedData::new(bytes, offset, len); let cursor = Cursor::new(data); From 58c020a2e1a94518b9c865b042e84ec0884d9ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 11 Apr 2019 14:51:17 +0200 Subject: [PATCH 24/44] feat: Store the word index into the database index --- meilidb-data/src/database.rs | 52 ++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index f3a4d1dbd..7cfc68f22 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,11 +1,17 @@ use std::sync::Arc; use std::path::Path; -use crate::schema::Schema; +use meilidb_core::Index as WordIndex; +use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; +use meilidb_core::write_to_bytes::WriteToBytes; +use sled::IVec; + +use crate::Schema; #[derive(Debug)] pub enum Error { - SchemaNotFound, + SchemaMissing, + WordIndexMissing, SledError(sled::Error), BincodeError(bincode::Error), } @@ -26,6 +32,13 @@ fn index_name(name: &str) -> Vec { format!("index-{}", name).into_bytes() } +fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> { + match ivec { + IVec::Inline(len, bytes) => Arc::from(&bytes[..len as usize]), + IVec::Remote { buf } => buf, + } +} + #[derive(Clone)] pub struct Database(sled::Db); @@ -62,29 +75,52 @@ impl Database { } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct Index { schema: Schema, + word_index: Arc, inner: Arc, } impl Index { fn from_raw(inner: Arc) -> Result { let bytes = inner.get("schema")?; - let bytes = bytes.ok_or(Error::SchemaNotFound)?; - + let bytes = bytes.ok_or(Error::SchemaMissing)?; let schema = Schema::read_from_bin(bytes.as_ref())?; - Ok(Index { schema, inner }) + + let bytes = inner.get("word-index")?; + let bytes = bytes.ok_or(Error::WordIndexMissing)?; + let word_index = { + let len = bytes.len(); + let bytes = ivec_into_arc(bytes); + let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len); + + // TODO must handle this error + let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap(); + + Arc::new(word_index) + }; + + Ok(Index { schema, word_index, inner }) } fn new_from_raw(inner: Arc, schema: Schema) -> Result { let mut schema_bytes = Vec::new(); - schema.write_to_bin(&mut schema_bytes); + schema.write_to_bin(&mut schema_bytes)?; inner.set("schema", schema_bytes)?; - Ok(Index { schema, inner }) + + let word_index = WordIndex::default(); + inner.set("word-index", word_index.into_bytes())?; + let word_index = Arc::new(word_index); + + Ok(Index { schema, word_index, inner }) } pub fn schema(&self) -> &Schema { &self.schema } + + pub fn word_index(&self) -> &WordIndex { + &self.word_index + } } From 7338e522bdff893286b81ef37d3e274b46b57faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 12 Apr 2019 18:46:36 +0200 Subject: [PATCH 25/44] squash-me: Add set/get/del_document_attribute to Index methods --- meilidb-data/src/database.rs | 47 ++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 7cfc68f22..41d343f05 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,12 +1,12 @@ use std::sync::Arc; use std::path::Path; -use meilidb_core::Index as WordIndex; +use meilidb_core::{DocumentId, Index as WordIndex}; use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; use sled::IVec; -use crate::Schema; +use crate::{Schema, SchemaAttr}; #[derive(Debug)] pub enum Error { @@ -32,6 +32,17 @@ fn index_name(name: &str) -> Vec { format!("index-{}", name).into_bytes() } +fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec { + let DocumentId(document_id) = id; + let SchemaAttr(schema_attr) = attr; + + let mut bytes = Vec::new(); + bytes.extend_from_slice(b"document-"); + bytes.extend_from_slice(&document_id.to_be_bytes()[..]); + bytes.extend_from_slice(&schema_attr.to_be_bytes()[..]); + bytes +} + fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> { match ivec { IVec::Inline(len, bytes) => Arc::from(&bytes[..len as usize]), @@ -123,4 +134,36 @@ impl Index { pub fn word_index(&self) -> &WordIndex { &self.word_index } + + pub fn set_document_attribute( + &self, + id: DocumentId, + attr: SchemaAttr, + value: V, + ) -> Result, Error> + where IVec: From, + { + let key = document_key(id, attr); + Ok(self.inner.set(key, value)?) + } + + pub fn get_document_attribute( + &self, + id: DocumentId, + attr: SchemaAttr + ) -> Result, Error> + { + let key = document_key(id, attr); + Ok(self.inner.get(key)?) + } + + pub fn del_document_attribute( + &self, + id: DocumentId, + attr: SchemaAttr + ) -> Result, Error> + { + let key = document_key(id, attr); + Ok(self.inner.del(key)?) + } } From 25a4961453c1057da2575bae71db591ee43b7b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 15 Apr 2019 15:16:53 +0200 Subject: [PATCH 26/44] feat: Introduce the Indexer struct --- meilidb-data/Cargo.toml | 2 + meilidb-data/src/indexer.rs | 84 +++++++++++++++++++++++++++++++++++++ meilidb-data/src/lib.rs | 10 +++-- 3 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 meilidb-data/src/indexer.rs diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index d98b9c491..7c13e9f72 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -9,7 +9,9 @@ bincode = "1.1.2" hashbrown = { version = "0.1.8", features = ["serde"] } linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } meilidb-core = { path = "../meilidb-core", version = "0.1.0" } +meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } ordered-float = { version = "1.0.2", features = ["serde"] } +sdset = "0.3.1" serde = { version = "1.0.88", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } sled = "0.22.1" diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs new file mode 100644 index 000000000..82a4ae156 --- /dev/null +++ b/meilidb-data/src/indexer.rs @@ -0,0 +1,84 @@ +use std::collections::BTreeMap; +use std::convert::TryFrom; + +use meilidb_core::{DocumentId, DocIndex}; +use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder}; +use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token}; +use crate::SchemaAttr; + +use sdset::Set; + +type Word = Vec; // TODO make it be a SmallVec + +pub struct Indexer { + word_limit: usize, // the maximum number of indexed words + indexed: BTreeMap>, +} + +impl Indexer { + pub fn new() -> Indexer { + Indexer { + word_limit: 1000, + indexed: BTreeMap::new(), + } + } + + pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { + for token in Tokenizer::new(text) { + if token.word_index >= self.word_limit { break } + let docindex = match token_to_docindex(id, attr, token) { + Some(docindex) => docindex, + None => break, + }; + + let word = Vec::from(token.word); + self.indexed.entry(word).or_insert_with(Vec::new).push(docindex); + } + } + + pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) + where I: IntoIterator, + { + let iter = iter.into_iter(); + for token in SeqTokenizer::new(iter) { + if token.word_index >= self.word_limit { break } + let docindex = match token_to_docindex(id, attr, token) { + Some(docindex) => docindex, + None => break, + }; + + let word = Vec::from(token.word); + self.indexed.entry(word).or_insert_with(Vec::new).push(docindex); + } + } + + pub fn build(self) -> WordIndex { + let mut builder = WordIndexBuilder::new(); + + for (key, mut indexes) in self.indexed { + indexes.sort_unstable(); + indexes.dedup(); + + let indexes = Set::new_unchecked(&indexes); + builder.insert(key, indexes).unwrap(); + } + + builder.build() + } +} + +fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option { + let word_index = u16::try_from(token.word_index).ok()?; + let char_index = u16::try_from(token.char_index).ok()?; + let char_length = u16::try_from(token.word.chars().count()).ok()?; + + let docindex = DocIndex { + document_id: id, + attribute: attr.0, + word_index: word_index, + char_index: char_index, + char_length: char_length, + }; + + Some(docindex) +} diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index 96d6bdf6e..c601105ed 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -1,9 +1,11 @@ mod database; -pub mod schema; -mod ranked_map; +mod indexer; mod number; +mod ranked_map; +pub mod schema; pub use self::database::{Database, Index}; -pub use self::schema::{Schema, SchemaAttr}; -pub use self::ranked_map::RankedMap; pub use self::number::Number; +pub use self::ranked_map::RankedMap; +pub use self::schema::{Schema, SchemaAttr}; +pub use self::indexer::Indexer; From 0104e93ba9539e3f93493643f55b56a4da63c096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 15 Apr 2019 16:07:41 +0200 Subject: [PATCH 27/44] feat: Introduce index events to update the WordIndex --- meilidb-data/Cargo.toml | 1 + meilidb-data/src/index_event.rs | 45 +++++++++++++++++++++++++++++++++ meilidb-data/src/lib.rs | 2 +- 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 meilidb-data/src/index_event.rs diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index 7c13e9f72..18b8ad1e6 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" [dependencies] bincode = "1.1.2" +byteorder = "1.3.1" hashbrown = { version = "0.1.8", features = ["serde"] } linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } meilidb-core = { path = "../meilidb-core", version = "0.1.0" } diff --git a/meilidb-data/src/index_event.rs b/meilidb-data/src/index_event.rs new file mode 100644 index 000000000..40d54cbf3 --- /dev/null +++ b/meilidb-data/src/index_event.rs @@ -0,0 +1,45 @@ +use std::error::Error; + +use byteorder::{ReadBytesExt, WriteBytesExt}; + +use meilidb_core::{Index as WordIndex}; +use meilidb_core::data::DocIds; +use meilidb_core::write_to_bytes::WriteToBytes; +use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; + +enum NewIndexEvent<'a> { + RemovedDocuments(&'a DocIds), + UpdatedDocuments(&'a WordIndex), +} + +impl<'a> WriteToBytes for NewIndexEvent<'a> { + fn write_to_bytes(&self, bytes: &mut Vec) { + match self { + NewIndexEvent::RemovedDocuments(doc_ids) => { + let _ = bytes.write_u8(0); + doc_ids.write_to_bytes(bytes); + }, + NewIndexEvent::UpdatedDocuments(index) => { + let _ = bytes.write_u8(1); + index.write_to_bytes(bytes); + } + } + } +} + +enum IndexEvent { + RemovedDocuments(DocIds), + UpdatedDocuments(WordIndex), +} + +impl FromSharedDataCursor for IndexEvent { + type Error = Box; + + fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { + match cursor.read_u8()? { + 0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments), + 1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments), + _ => Err("invalid index event type".into()), + } + } +} diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index c601105ed..a19418a3f 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -1,4 +1,5 @@ mod database; +mod index_event; mod indexer; mod number; mod ranked_map; @@ -8,4 +9,3 @@ pub use self::database::{Database, Index}; pub use self::number::Number; pub use self::ranked_map::RankedMap; pub use self::schema::{Schema, SchemaAttr}; -pub use self::indexer::Indexer; From b7805fee93bc9ac9e23a960d07da29bd61de2faf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 16 Apr 2019 10:47:52 +0200 Subject: [PATCH 28/44] feat: Store already opened indexes and word indexes --- meilidb-data/Cargo.toml | 1 + meilidb-data/src/database.rs | 67 +++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index 18b8ad1e6..c3dc2dc05 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -5,6 +5,7 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] +arc-swap = "0.3.11" bincode = "1.1.2" byteorder = "1.3.1" hashbrown = { version = "0.1.8", features = ["serde"] } diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 41d343f05..006612a1e 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,15 +1,18 @@ -use std::sync::Arc; use std::path::Path; +use std::sync::Arc; -use meilidb_core::{DocumentId, Index as WordIndex}; +use arc_swap::{ArcSwap, Lease}; +use hashbrown::HashMap; use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; +use meilidb_core::{DocumentId, Index as WordIndex}; use sled::IVec; use crate::{Schema, SchemaAttr}; #[derive(Debug)] pub enum Error { + SchemaDiffer, SchemaMissing, WordIndexMissing, SledError(sled::Error), @@ -51,35 +54,61 @@ fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> { } #[derive(Clone)] -pub struct Database(sled::Db); +pub struct Database { + opened: Arc>>, + inner: sled::Db, +} impl Database { pub fn start_default>(path: P) -> Result { - sled::Db::start_default(path).map(Database).map_err(Into::into) + let inner = sled::Db::start_default(path)?; + let opened = Arc::new(ArcSwap::new(Arc::new(HashMap::new()))); + Ok(Database { opened, inner }) } pub fn open_index(&self, name: &str) -> Result, Error> { - let name = index_name(name); + // check if the index was already opened + if let Some(index) = self.opened.lease().get(name) { + return Ok(Some(index.clone())) + } - if self.0.tree_names().into_iter().any(|tn| tn == name) { - let tree = self.0.open_tree(name)?; + let raw_name = index_name(name); + if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) { + let tree = self.inner.open_tree(raw_name)?; let index = Index::from_raw(tree)?; + + self.opened.rcu(|opened| { + let mut opened = HashMap::clone(opened); + opened.insert(name.to_string(), index.clone()); + opened + }); + return Ok(Some(index)) } Ok(None) } - pub fn create_index(&self, name: &str, schema: Schema) -> Result { - match self.open_index(name)? { + pub fn create_index(&self, name: String, schema: Schema) -> Result { + match self.open_index(&name)? { Some(index) => { - // TODO check if the schema is the same + if index.schema != schema { + return Err(Error::SchemaDiffer); + } + Ok(index) }, None => { - let name = index_name(name); - let tree = self.0.open_tree(name)?; + let raw_name = index_name(&name); + let tree = self.inner.open_tree(raw_name)?; let index = Index::new_from_raw(tree, schema)?; + + self.opened.rcu(|opened| { + let mut opened = HashMap::clone(opened); + opened.insert(name.clone(), index.clone()); + opened + }); + Ok(index) }, } @@ -89,7 +118,7 @@ impl Database { #[derive(Clone)] pub struct Index { schema: Schema, - word_index: Arc, + word_index: Arc>, inner: Arc, } @@ -109,7 +138,7 @@ impl Index { // TODO must handle this error let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap(); - Arc::new(word_index) + Arc::new(ArcSwap::new(Arc::new(word_index))) }; Ok(Index { schema, word_index, inner }) @@ -122,7 +151,7 @@ impl Index { let word_index = WordIndex::default(); inner.set("word-index", word_index.into_bytes())?; - let word_index = Arc::new(word_index); + let word_index = Arc::new(ArcSwap::new(Arc::new(word_index))); Ok(Index { schema, word_index, inner }) } @@ -131,8 +160,12 @@ impl Index { &self.schema } - pub fn word_index(&self) -> &WordIndex { - &self.word_index + pub fn word_index(&self) -> Lease> { + self.word_index.lease() + } + + fn update_word_index(&self, word_index: Arc) { + self.word_index.store(word_index) } pub fn set_document_attribute( From ee2bad20c79a7846388115c981379985f1b8332e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 16 Apr 2019 12:06:40 +0200 Subject: [PATCH 29/44] feat: Store the RankedMap into the inner sled tree --- meilidb-data/src/database.rs | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 006612a1e..baacddc35 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -8,7 +8,7 @@ use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::{DocumentId, Index as WordIndex}; use sled::IVec; -use crate::{Schema, SchemaAttr}; +use crate::{Schema, SchemaAttr, RankedMap}; #[derive(Debug)] pub enum Error { @@ -119,14 +119,17 @@ impl Database { pub struct Index { schema: Schema, word_index: Arc>, + ranked_map: Arc>, inner: Arc, } impl Index { fn from_raw(inner: Arc) -> Result { - let bytes = inner.get("schema")?; - let bytes = bytes.ok_or(Error::SchemaMissing)?; - let schema = Schema::read_from_bin(bytes.as_ref())?; + let schema = { + let bytes = inner.get("schema")?; + let bytes = bytes.ok_or(Error::SchemaMissing)?; + Schema::read_from_bin(bytes.as_ref())? + }; let bytes = inner.get("word-index")?; let bytes = bytes.ok_or(Error::WordIndexMissing)?; @@ -141,7 +144,16 @@ impl Index { Arc::new(ArcSwap::new(Arc::new(word_index))) }; - Ok(Index { schema, word_index, inner }) + let ranked_map = { + let map = match inner.get("ranked-map")? { + Some(bytes) => bincode::deserialize(bytes.as_ref())?, + None => RankedMap::default(), + }; + + Arc::new(ArcSwap::new(Arc::new(map))) + }; + + Ok(Index { schema, word_index, ranked_map, inner }) } fn new_from_raw(inner: Arc, schema: Schema) -> Result { @@ -153,7 +165,9 @@ impl Index { inner.set("word-index", word_index.into_bytes())?; let word_index = Arc::new(ArcSwap::new(Arc::new(word_index))); - Ok(Index { schema, word_index, inner }) + let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default()))); + + Ok(Index { schema, word_index, ranked_map, inner }) } pub fn schema(&self) -> &Schema { @@ -164,10 +178,18 @@ impl Index { self.word_index.lease() } + pub fn ranked_map(&self) -> Lease> { + self.ranked_map.lease() + } + fn update_word_index(&self, word_index: Arc) { self.word_index.store(word_index) } + fn update_ranked_map(&self, ranked_map: Arc) { + self.ranked_map.store(ranked_map) + } + pub fn set_document_attribute( &self, id: DocumentId, From 4b40d5b0d4fc722c4a0b08032bdb825d0909f557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 18 Apr 2019 13:58:35 +0200 Subject: [PATCH 30/44] feat: Introduce the Index struct --- meilidb-core/Cargo.toml | 2 +- meilidb-data/Cargo.toml | 8 +- meilidb-data/src/database.rs | 227 ++++++++++++++++++++++++++++++++--- 3 files changed, 216 insertions(+), 21 deletions(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index f0e6dc6e0..16bc204d4 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -6,7 +6,7 @@ edition = "2018" [dependencies] byteorder = "1.3.1" -hashbrown = "0.1.8" +hashbrown = "0.2.2" lazy_static = "1.2.0" log = "0.4.6" meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index c3dc2dc05..e6fca8c66 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -8,13 +8,17 @@ edition = "2018" arc-swap = "0.3.11" bincode = "1.1.2" byteorder = "1.3.1" -hashbrown = { version = "0.1.8", features = ["serde"] } +hashbrown = { version = "0.2.2", features = ["serde"] } linked-hash-map = { version = "0.5.2", features = ["serde_impl"] } meilidb-core = { path = "../meilidb-core", version = "0.1.0" } meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } ordered-float = { version = "1.0.2", features = ["serde"] } sdset = "0.3.1" -serde = { version = "1.0.88", features = ["derive"] } +serde = { version = "1.0.90", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } sled = "0.22.1" toml = { version = "0.5.0", features = ["preserve_order"] } + +[dependencies.rmp-serde] +git = "https://github.com/3Hren/msgpack-rust.git" +rev = "40b3d48" diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index baacddc35..6aac48b48 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -1,3 +1,6 @@ +use std::collections::HashSet; +use std::io::{self, Cursor, BufRead}; +use std::iter::FromIterator; use std::path::Path; use std::sync::Arc; @@ -6,7 +9,11 @@ use hashbrown::HashMap; use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::{DocumentId, Index as WordIndex}; +use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; +use rmp_serde::decode::{Error as RmpError}; +use serde::{de, forward_to_deserialize_any}; use sled::IVec; +use byteorder::{ReadBytesExt, BigEndian}; use crate::{Schema, SchemaAttr, RankedMap}; @@ -46,6 +53,37 @@ fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec { bytes } +trait CursorExt { + fn consume_if_eq(&mut self, needle: &[u8]) -> bool; +} + +impl> CursorExt for Cursor { + fn consume_if_eq(&mut self, needle: &[u8]) -> bool { + let position = self.position() as usize; + let slice = self.get_ref().as_ref(); + + if slice[position..].starts_with(needle) { + self.consume(needle.len()); + true + } else { + false + } + } +} + +fn extract_document_key(key: Vec) -> io::Result<(DocumentId, SchemaAttr)> { + let mut key = Cursor::new(key); + + if !key.consume_if_eq(b"document-") { + return Err(io::Error::from(io::ErrorKind::InvalidData)) + } + + let document_id = key.read_u64::().map(DocumentId)?; + let schema_attr = key.read_u16::().map(SchemaAttr)?; + + Ok((document_id, schema_attr)) +} + fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> { match ivec { IVec::Inline(len, bytes) => Arc::from(&bytes[..len as usize]), @@ -55,7 +93,7 @@ fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> { #[derive(Clone)] pub struct Database { - opened: Arc>>, + opened: Arc>>, inner: sled::Db, } @@ -68,22 +106,22 @@ impl Database { pub fn open_index(&self, name: &str) -> Result, Error> { // check if the index was already opened - if let Some(index) = self.opened.lease().get(name) { - return Ok(Some(index.clone())) + if let Some(raw_index) = self.opened.lease().get(name) { + return Ok(Some(Index(raw_index.clone()))) } let raw_name = index_name(name); if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) { let tree = self.inner.open_tree(raw_name)?; - let index = Index::from_raw(tree)?; + let raw_index = RawIndex::from_raw(tree)?; self.opened.rcu(|opened| { let mut opened = HashMap::clone(opened); - opened.insert(name.to_string(), index.clone()); + opened.insert(name.to_string(), raw_index.clone()); opened }); - return Ok(Some(index)) + return Ok(Some(Index(raw_index))) } Ok(None) @@ -92,7 +130,7 @@ impl Database { pub fn create_index(&self, name: String, schema: Schema) -> Result { match self.open_index(&name)? { Some(index) => { - if index.schema != schema { + if index.schema() != &schema { return Err(Error::SchemaDiffer); } @@ -101,30 +139,30 @@ impl Database { None => { let raw_name = index_name(&name); let tree = self.inner.open_tree(raw_name)?; - let index = Index::new_from_raw(tree, schema)?; + let raw_index = RawIndex::new_from_raw(tree, schema)?; self.opened.rcu(|opened| { let mut opened = HashMap::clone(opened); - opened.insert(name.clone(), index.clone()); + opened.insert(name.clone(), raw_index.clone()); opened }); - Ok(index) + Ok(Index(raw_index)) }, } } } #[derive(Clone)] -pub struct Index { +pub struct RawIndex { schema: Schema, word_index: Arc>, ranked_map: Arc>, inner: Arc, } -impl Index { - fn from_raw(inner: Arc) -> Result { +impl RawIndex { + fn from_raw(inner: Arc) -> Result { let schema = { let bytes = inner.get("schema")?; let bytes = bytes.ok_or(Error::SchemaMissing)?; @@ -153,10 +191,10 @@ impl Index { Arc::new(ArcSwap::new(Arc::new(map))) }; - Ok(Index { schema, word_index, ranked_map, inner }) + Ok(RawIndex { schema, word_index, ranked_map, inner }) } - fn new_from_raw(inner: Arc, schema: Schema) -> Result { + fn new_from_raw(inner: Arc, schema: Schema) -> Result { let mut schema_bytes = Vec::new(); schema.write_to_bin(&mut schema_bytes)?; inner.set("schema", schema_bytes)?; @@ -167,7 +205,7 @@ impl Index { let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default()))); - Ok(Index { schema, word_index, ranked_map, inner }) + Ok(RawIndex { schema, word_index, ranked_map, inner }) } pub fn schema(&self) -> &Schema { @@ -182,11 +220,11 @@ impl Index { self.ranked_map.lease() } - fn update_word_index(&self, word_index: Arc) { + pub fn update_word_index(&self, word_index: Arc) { self.word_index.store(word_index) } - fn update_ranked_map(&self, ranked_map: Arc) { + pub fn update_ranked_map(&self, ranked_map: Arc) { self.ranked_map.store(ranked_map) } @@ -212,6 +250,12 @@ impl Index { Ok(self.inner.get(key)?) } + pub fn get_document_fields(&self, id: DocumentId) -> DocumentFieldsIter { + let start = document_key(id, SchemaAttr::min()); + let end = document_key(id, SchemaAttr::max()); + DocumentFieldsIter(self.inner.range(start..=end)) + } + pub fn del_document_attribute( &self, id: DocumentId, @@ -222,3 +266,150 @@ impl Index { Ok(self.inner.del(key)?) } } + +pub struct DocumentFieldsIter<'a>(sled::Iter<'a>); + +impl<'a> Iterator for DocumentFieldsIter<'a> { + type Item = Result<(DocumentId, SchemaAttr, IVec), Error>; + + fn next(&mut self) -> Option { + match self.0.next() { + Some(Ok((key, value))) => { + let (id, attr) = extract_document_key(key).unwrap(); + Some(Ok((id, attr, value))) + }, + Some(Err(e)) => Some(Err(Error::SledError(e))), + None => None, + } + } +} + +#[derive(Clone)] +pub struct Index(RawIndex); + +impl Index { + pub fn schema(&self) -> &Schema { + self.0.schema() + } + + pub fn word_index(&self) -> Lease> { + self.0.word_index() + } + + pub fn ranked_map(&self) -> Lease> { + self.0.ranked_map() + } + + pub fn document( + &self, + fields: Option<&HashSet<&str>>, + id: DocumentId, + ) -> Result, RmpError> + where T: de::DeserializeOwned, + { + let fields = match fields { + Some(fields) => { + let iter = fields.iter().filter_map(|n| self.0.schema().attribute(n)); + Some(HashSet::from_iter(iter)) + }, + None => None, + }; + + let mut deserializer = Deserializer { + document_id: id, + raw_index: &self.0, + fields: fields.as_ref(), + }; + + // TODO: currently we return an error if all document fields are missing, + // returning None would have been better + T::deserialize(&mut deserializer).map(Some) + } +} + +struct Deserializer<'a> { + document_id: DocumentId, + raw_index: &'a RawIndex, + fields: Option<&'a HashSet>, +} + +impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> +{ + type Error = RmpError; + + fn deserialize_any(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + self.deserialize_map(visitor) + } + + forward_to_deserialize_any! { + bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq + bytes byte_buf unit_struct tuple_struct + identifier tuple ignored_any option newtype_struct enum struct + } + + fn deserialize_map(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + let document_attributes = self.raw_index.get_document_fields(self.document_id); + let document_attributes = document_attributes.filter_map(|result| { + match result { + Ok(value) => Some(value), + Err(e) => { + // TODO: must log the error + // error!("sled iter error; {}", e); + None + }, + } + }); + let iter = document_attributes.filter_map(|(_, attr, value)| { + if self.fields.map_or(true, |f| f.contains(&attr)) { + let attribute_name = self.raw_index.schema.attribute_name(attr); + Some((attribute_name, Value::new(value))) + } else { + None + } + }); + + let map_deserializer = de::value::MapDeserializer::new(iter); + visitor.visit_map(map_deserializer) + } +} + +struct Value(RmpDeserializer>>) where A: AsRef<[u8]>; + +impl Value where A: AsRef<[u8]> +{ + fn new(value: A) -> Value { + Value(RmpDeserializer::new(Cursor::new(value))) + } +} + +impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value +where A: AsRef<[u8]>, +{ + type Deserializer = Self; + + fn into_deserializer(self) -> Self::Deserializer { + self + } +} + +impl<'de, 'a, A> de::Deserializer<'de> for Value +where A: AsRef<[u8]>, +{ + type Error = RmpError; + + fn deserialize_any(mut self, visitor: V) -> Result + where V: de::Visitor<'de> + { + self.0.deserialize_any(visitor) + } + + forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct map struct enum identifier ignored_any + } +} From 187e6740bd1325ba2d4686ed25749d0feb9205a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 18 Apr 2019 14:11:00 +0200 Subject: [PATCH 31/44] feat: Allow users to construct query builders from database indexes --- meilidb-core/src/query_builder.rs | 44 ++++++++++++++++++------------- meilidb-data/src/database.rs | 16 +++++++++++ 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index d5ec79a50..4d2327871 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -1,8 +1,8 @@ -use std::{cmp, mem}; -use std::ops::Range; -use std::time::Instant; use std::hash::Hash; +use std::ops::{Range, Deref}; use std::rc::Rc; +use std::time::Instant; +use std::{cmp, mem}; use rayon::slice::ParallelSliceMut; use slice_group_by::GroupByMut; @@ -35,26 +35,26 @@ fn generate_automatons(query: &str) -> Vec { automatons } -pub struct QueryBuilder<'i, 'c, FI = fn(DocumentId) -> bool> { - index: &'i Index, +pub struct QueryBuilder<'c, I, FI = fn(DocumentId) -> bool> { + index: I, criteria: Criteria<'c>, searchable_attrs: Option>, filter: Option, } -impl<'i, 'c> QueryBuilder<'i, 'c, fn(DocumentId) -> bool> { - pub fn new(index: &'i Index) -> Self { +impl<'c, I> QueryBuilder<'c, I, fn(DocumentId) -> bool> { + pub fn new(index: I) -> Self { QueryBuilder::with_criteria(index, Criteria::default()) } - pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self { + pub fn with_criteria(index: I, criteria: Criteria<'c>) -> Self { QueryBuilder { index, criteria, searchable_attrs: None, filter: None } } } -impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> +impl<'c, I, FI> QueryBuilder<'c, I, FI> { - pub fn with_filter(self, function: F) -> QueryBuilder<'i, 'c, F> + pub fn with_filter(self, function: F) -> QueryBuilder<'c, I, F> where F: Fn(DocumentId) -> bool, { QueryBuilder { @@ -65,7 +65,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> } } - pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F> + pub fn with_distinct(self, function: F, size: usize) -> DistinctQueryBuilder<'c, I, FI, F> where F: Fn(DocumentId) -> Option, K: Hash + Eq, { @@ -80,7 +80,11 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> let attributes = self.searchable_attrs.get_or_insert_with(HashSet::new); attributes.insert(attribute); } +} +impl<'c, I, FI> QueryBuilder<'c, I, FI> +where I: Deref, +{ fn query_all(&self, query: &str) -> Vec { let automatons = generate_automatons(query); @@ -131,8 +135,9 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> } } -impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> -where FI: Fn(DocumentId) -> bool, +impl<'c, I, FI> QueryBuilder<'c, I, FI> +where I: Deref, + FI: Fn(DocumentId) -> bool, { pub fn query(self, query: &str, range: Range) -> Vec { // We delegate the filter work to the distinct query builder, @@ -184,15 +189,15 @@ where FI: Fn(DocumentId) -> bool, } } -pub struct DistinctQueryBuilder<'i, 'c, FI, FD> { - inner: QueryBuilder<'i, 'c, FI>, +pub struct DistinctQueryBuilder<'c, I, FI, FD> { + inner: QueryBuilder<'c, I, FI>, function: FD, size: usize, } -impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD> +impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD> { - pub fn with_filter(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD> + pub fn with_filter(self, function: F) -> DistinctQueryBuilder<'c, I, F, FD> where F: Fn(DocumentId) -> bool, { DistinctQueryBuilder { @@ -207,8 +212,9 @@ impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD> } } -impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD> -where FI: Fn(DocumentId) -> bool, +impl<'c, I, FI, FD, K> DistinctQueryBuilder<'c, I, FI, FD> +where I: Deref, + FI: Fn(DocumentId) -> bool, FD: Fn(DocumentId) -> Option, K: Hash + Eq, { diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 6aac48b48..44babb55d 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -6,6 +6,8 @@ use std::sync::Arc; use arc_swap::{ArcSwap, Lease}; use hashbrown::HashMap; +use meilidb_core::criterion::Criteria; +use meilidb_core::QueryBuilder; use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::{DocumentId, Index as WordIndex}; @@ -288,6 +290,20 @@ impl<'a> Iterator for DocumentFieldsIter<'a> { pub struct Index(RawIndex); impl Index { + pub fn query_builder(&self) -> QueryBuilder>> { + let word_index = self.word_index(); + QueryBuilder::new(word_index) + } + + pub fn query_builder_with_criteria<'c>( + &self, + criteria: Criteria<'c>, + ) -> QueryBuilder<'c, Lease>> + { + let word_index = self.word_index(); + QueryBuilder::with_criteria(word_index, criteria) + } + pub fn schema(&self) -> &Schema { self.0.schema() } From 725e7b42294b8f01ea809d01ddb68ec2c478a103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 18 Apr 2019 14:23:09 +0200 Subject: [PATCH 32/44] chore: Move the Deserializer into the the serde module --- meilidb-data/src/database.rs | 93 +----------------------- meilidb-data/src/lib.rs | 1 + meilidb-data/src/serde/deserializer.rs | 97 ++++++++++++++++++++++++++ meilidb-data/src/serde/mod.rs | 3 + 4 files changed, 104 insertions(+), 90 deletions(-) create mode 100644 meilidb-data/src/serde/deserializer.rs create mode 100644 meilidb-data/src/serde/mod.rs diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 44babb55d..927d22e21 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -5,19 +5,19 @@ use std::path::Path; use std::sync::Arc; use arc_swap::{ArcSwap, Lease}; +use byteorder::{ReadBytesExt, BigEndian}; use hashbrown::HashMap; use meilidb_core::criterion::Criteria; use meilidb_core::QueryBuilder; use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::{DocumentId, Index as WordIndex}; -use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; use rmp_serde::decode::{Error as RmpError}; -use serde::{de, forward_to_deserialize_any}; +use serde::de; use sled::IVec; -use byteorder::{ReadBytesExt, BigEndian}; use crate::{Schema, SchemaAttr, RankedMap}; +use crate::serde::Deserializer; #[derive(Debug)] pub enum Error { @@ -342,90 +342,3 @@ impl Index { T::deserialize(&mut deserializer).map(Some) } } - -struct Deserializer<'a> { - document_id: DocumentId, - raw_index: &'a RawIndex, - fields: Option<&'a HashSet>, -} - -impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> -{ - type Error = RmpError; - - fn deserialize_any(self, visitor: V) -> Result - where V: de::Visitor<'de> - { - self.deserialize_map(visitor) - } - - forward_to_deserialize_any! { - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq - bytes byte_buf unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } - - fn deserialize_map(self, visitor: V) -> Result - where V: de::Visitor<'de> - { - let document_attributes = self.raw_index.get_document_fields(self.document_id); - let document_attributes = document_attributes.filter_map(|result| { - match result { - Ok(value) => Some(value), - Err(e) => { - // TODO: must log the error - // error!("sled iter error; {}", e); - None - }, - } - }); - let iter = document_attributes.filter_map(|(_, attr, value)| { - if self.fields.map_or(true, |f| f.contains(&attr)) { - let attribute_name = self.raw_index.schema.attribute_name(attr); - Some((attribute_name, Value::new(value))) - } else { - None - } - }); - - let map_deserializer = de::value::MapDeserializer::new(iter); - visitor.visit_map(map_deserializer) - } -} - -struct Value(RmpDeserializer>>) where A: AsRef<[u8]>; - -impl Value where A: AsRef<[u8]> -{ - fn new(value: A) -> Value { - Value(RmpDeserializer::new(Cursor::new(value))) - } -} - -impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value -where A: AsRef<[u8]>, -{ - type Deserializer = Self; - - fn into_deserializer(self) -> Self::Deserializer { - self - } -} - -impl<'de, 'a, A> de::Deserializer<'de> for Value -where A: AsRef<[u8]>, -{ - type Error = RmpError; - - fn deserialize_any(mut self, visitor: V) -> Result - where V: de::Visitor<'de> - { - self.0.deserialize_any(visitor) - } - - forward_to_deserialize_any! { - bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string - bytes byte_buf option unit unit_struct newtype_struct seq tuple - tuple_struct map struct enum identifier ignored_any - } -} diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs index a19418a3f..542741171 100644 --- a/meilidb-data/src/lib.rs +++ b/meilidb-data/src/lib.rs @@ -3,6 +3,7 @@ mod index_event; mod indexer; mod number; mod ranked_map; +mod serde; pub mod schema; pub use self::database::{Database, Index}; diff --git a/meilidb-data/src/serde/deserializer.rs b/meilidb-data/src/serde/deserializer.rs new file mode 100644 index 000000000..12873713b --- /dev/null +++ b/meilidb-data/src/serde/deserializer.rs @@ -0,0 +1,97 @@ +use std::collections::HashSet; +use std::io::Cursor; + +use meilidb_core::DocumentId; +use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; +use rmp_serde::decode::{Error as RmpError}; +use serde::{de, forward_to_deserialize_any}; + +use crate::database::RawIndex; +use crate::SchemaAttr; + +pub struct Deserializer<'a> { + pub document_id: DocumentId, + pub raw_index: &'a RawIndex, + pub fields: Option<&'a HashSet>, +} + +impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> +{ + type Error = RmpError; + + fn deserialize_any(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + self.deserialize_map(visitor) + } + + forward_to_deserialize_any! { + bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq + bytes byte_buf unit_struct tuple_struct + identifier tuple ignored_any option newtype_struct enum struct + } + + fn deserialize_map(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + let document_attributes = self.raw_index.get_document_fields(self.document_id); + let document_attributes = document_attributes.filter_map(|result| { + match result { + Ok(value) => Some(value), + Err(e) => { + // TODO: must log the error + // error!("sled iter error; {}", e); + None + }, + } + }); + let iter = document_attributes.filter_map(|(_, attr, value)| { + if self.fields.map_or(true, |f| f.contains(&attr)) { + let attribute_name = self.raw_index.schema().attribute_name(attr); + Some((attribute_name, Value::new(value))) + } else { + None + } + }); + + let map_deserializer = de::value::MapDeserializer::new(iter); + visitor.visit_map(map_deserializer) + } +} + +struct Value(RmpDeserializer>>) where A: AsRef<[u8]>; + +impl Value where A: AsRef<[u8]> +{ + fn new(value: A) -> Value { + Value(RmpDeserializer::new(Cursor::new(value))) + } +} + +impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value +where A: AsRef<[u8]>, +{ + type Deserializer = Self; + + fn into_deserializer(self) -> Self::Deserializer { + self + } +} + +impl<'de, 'a, A> de::Deserializer<'de> for Value +where A: AsRef<[u8]>, +{ + type Error = RmpError; + + fn deserialize_any(mut self, visitor: V) -> Result + where V: de::Visitor<'de> + { + self.0.deserialize_any(visitor) + } + + forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct map struct enum identifier ignored_any + } +} diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs new file mode 100644 index 000000000..beb7660e9 --- /dev/null +++ b/meilidb-data/src/serde/mod.rs @@ -0,0 +1,3 @@ +mod deserializer; + +pub use self::deserializer::Deserializer; From 2a69170f14edb778b650b3dd628e78e3d014c16d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Apr 2019 13:41:52 +0200 Subject: [PATCH 33/44] feat: Introduce the DocumentsDeletion type --- meilidb-data/src/database.rs | 57 ++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 927d22e21..fea4d84d1 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -13,6 +13,7 @@ use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor}; use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::{DocumentId, Index as WordIndex}; use rmp_serde::decode::{Error as RmpError}; +use sdset::SetBuf; use serde::de; use sled::IVec; @@ -222,8 +223,12 @@ impl RawIndex { self.ranked_map.lease() } - pub fn update_word_index(&self, word_index: Arc) { - self.word_index.store(word_index) + pub fn update_word_index(&self, word_index: Arc) -> sled::Result<()> { + let data = word_index.into_bytes(); + self.inner.set("word-index", data).map(drop)?; + self.word_index.store(word_index); + + Ok(()) } pub fn update_ranked_map(&self, ranked_map: Arc) { @@ -316,6 +321,16 @@ impl Index { self.0.ranked_map() } + pub fn documents_addition(&self) -> DocumentsAddition { + let index = self.0.clone(); + DocumentsAddition::from_raw(index) + } + + pub fn documents_deletion(&self) -> DocumentsDeletion { + let index = self.0.clone(); + DocumentsDeletion::from_raw(index) + } + pub fn document( &self, fields: Option<&HashSet<&str>>, @@ -342,3 +357,41 @@ impl Index { T::deserialize(&mut deserializer).map(Some) } } + +pub struct DocumentsAddition(RawIndex); + +impl DocumentsAddition { + pub fn from_raw(inner: RawIndex) -> DocumentsAddition { + unimplemented!() + } +} + +pub struct DocumentsDeletion { + inner: RawIndex, + documents: Vec, +} + +impl DocumentsDeletion { + pub fn from_raw(inner: RawIndex) -> DocumentsDeletion { + DocumentsDeletion { inner, documents: Vec::new() } + } + + pub fn delete_document(&mut self, id: DocumentId) { + self.documents.push(id); + } + + pub fn commit(mut self) -> Result<(), Error> { + self.documents.sort_unstable(); + self.documents.dedup(); + + let idset = SetBuf::new_unchecked(self.documents); + let index = self.inner.word_index(); + + let new_index = index.remove_documents(&idset); + let new_index = Arc::from(new_index); + + self.inner.update_word_index(new_index)?; + + Ok(()) + } +} From ea0ee070ef4fb2b228603ea9382317e4c53a0cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Apr 2019 15:50:53 +0200 Subject: [PATCH 34/44] feat: Introduce the Serializer Which will serialize documents fields as message pack in the kv-store --- meilidb-data/src/database.rs | 24 +- meilidb-data/src/indexer.rs | 7 + meilidb-data/src/serde/extract_string.rs | 146 +++++++++++++ meilidb-data/src/serde/mod.rs | 70 ++++++ meilidb-data/src/serde/serializer.rs | 266 +++++++++++++++++++++++ 5 files changed, 508 insertions(+), 5 deletions(-) create mode 100644 meilidb-data/src/serde/extract_string.rs create mode 100644 meilidb-data/src/serde/serializer.rs diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index fea4d84d1..ef67227bd 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -19,6 +19,7 @@ use sled::IVec; use crate::{Schema, SchemaAttr, RankedMap}; use crate::serde::Deserializer; +use crate::indexer::Indexer; #[derive(Debug)] pub enum Error { @@ -240,7 +241,7 @@ impl RawIndex { id: DocumentId, attr: SchemaAttr, value: V, - ) -> Result, Error> + ) -> Result, sled::Error> where IVec: From, { let key = document_key(id, attr); @@ -251,7 +252,7 @@ impl RawIndex { &self, id: DocumentId, attr: SchemaAttr - ) -> Result, Error> + ) -> Result, sled::Error> { let key = document_key(id, attr); Ok(self.inner.get(key)?) @@ -267,7 +268,7 @@ impl RawIndex { &self, id: DocumentId, attr: SchemaAttr - ) -> Result, Error> + ) -> Result, sled::Error> { let key = document_key(id, attr); Ok(self.inner.del(key)?) @@ -358,10 +359,23 @@ impl Index { } } -pub struct DocumentsAddition(RawIndex); +pub struct DocumentsAddition { + inner: RawIndex, + indexer: Indexer, +} impl DocumentsAddition { pub fn from_raw(inner: RawIndex) -> DocumentsAddition { + DocumentsAddition { inner, indexer: Indexer::new() } + } + + pub fn update_document(&mut self, document: D) -> Result<(), Error> + where D: serde::Serialize, + { + unimplemented!() + } + + pub fn finalize(self) -> sled::Result<()> { unimplemented!() } } @@ -380,7 +394,7 @@ impl DocumentsDeletion { self.documents.push(id); } - pub fn commit(mut self) -> Result<(), Error> { + pub fn finalize(mut self) -> Result<(), Error> { self.documents.sort_unstable(); self.documents.dedup(); diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index 82a4ae156..a1be35a93 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -23,6 +23,13 @@ impl Indexer { } } + pub fn with_word_limit(limit: usize) -> Indexer { + Indexer { + word_limit: limit, + indexed: BTreeMap::new(), + } + } + pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { for token in Tokenizer::new(text) { if token.word_index >= self.word_limit { break } diff --git a/meilidb-data/src/serde/extract_string.rs b/meilidb-data/src/serde/extract_string.rs new file mode 100644 index 000000000..9c3ef25e5 --- /dev/null +++ b/meilidb-data/src/serde/extract_string.rs @@ -0,0 +1,146 @@ +use serde::Serialize; +use serde::ser; + +use super::SerializerError; + +pub struct ExtractString; + +impl ser::Serializer for ExtractString { + type Ok = String; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! { + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, value: &str) -> Result { + Ok(value.to_string()) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs index beb7660e9..284c970cf 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-data/src/serde/mod.rs @@ -1,3 +1,73 @@ +macro_rules! forward_to_unserializable_type { + ($($ty:ident => $se_method:ident,)*) => { + $( + fn $se_method(self, _v: $ty) -> Result { + Err(SerializerError::UnserializableType { name: "$ty" }) + } + )* + } +} + mod deserializer; +mod serializer; +mod extract_string; pub use self::deserializer::Deserializer; +pub use self::serializer::Serializer; +pub use self::extract_string::ExtractString; + +use std::{fmt, error::Error}; +use rmp_serde::encode::Error as RmpError; +use serde::ser; + +#[derive(Debug)] +pub enum SerializerError { + DocumentIdNotFound, + RmpError(RmpError), + SledError(sled::Error), + UnserializableType { name: &'static str }, + Custom(String), +} + +impl ser::Error for SerializerError { + fn custom(msg: T) -> Self { + SerializerError::Custom(msg.to_string()) + } +} + +impl fmt::Display for SerializerError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + SerializerError::DocumentIdNotFound => { + write!(f, "serialized document does not have an id according to the schema") + } + SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e), + SerializerError::SledError(e) => write!(f, "sled related error: {}", e), + SerializerError::UnserializableType { name } => { + write!(f, "Only struct and map types are considered valid documents and + can be serialized, not {} types directly.", name) + }, + SerializerError::Custom(s) => f.write_str(&s), + } + } +} + +impl Error for SerializerError {} + +impl From for SerializerError { + fn from(value: String) -> SerializerError { + SerializerError::Custom(value) + } +} + +impl From for SerializerError { + fn from(error: RmpError) -> SerializerError { + SerializerError::RmpError(error) + } +} + +impl From for SerializerError { + fn from(error: sled::Error) -> SerializerError { + SerializerError::SledError(error) + } +} diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs new file mode 100644 index 000000000..7a5808cfd --- /dev/null +++ b/meilidb-data/src/serde/serializer.rs @@ -0,0 +1,266 @@ +use std::collections::{HashSet, HashMap}; +use std::fmt; +use std::error::Error; + +use meilidb_core::DocumentId; +use serde::{de, ser}; + +use crate::schema::Schema; +use crate::database::RawIndex; +use super::{SerializerError, ExtractString}; + +pub struct Serializer<'a> { + pub schema: &'a Schema, + pub index: &'a RawIndex, + pub document_id: DocumentId, +} + +impl<'a> ser::Serializer for Serializer<'a> { + type Ok = (); + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = MapSerializer<'a>; + type SerializeStruct = StructSerializer<'a>; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! { + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, _v: &str) -> Result { + Err(SerializerError::UnserializableType { name: "str" }) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: ser::Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: ser::Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: ser::Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Ok(MapSerializer { + schema: self.schema, + document_id: self.document_id, + index: self.index, + current_key_name: None, + }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Ok(StructSerializer { + schema: self.schema, + document_id: self.document_id, + index: self.index, + }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} + +pub struct MapSerializer<'a> { + pub schema: &'a Schema, + pub document_id: DocumentId, + pub index: &'a RawIndex, + pub current_key_name: Option, +} + +impl<'a> ser::SerializeMap for MapSerializer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let key = key.serialize(ExtractString)?; + self.current_key_name = Some(key); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let key = self.current_key_name.take().unwrap(); + self.serialize_entry(&key, value) + } + + fn serialize_entry( + &mut self, + key: &K, + value: &V, + ) -> Result<(), Self::Error> + where K: ser::Serialize, V: ser::Serialize, + { + let key = key.serialize(ExtractString)?; + + serialize_value( + self.schema, + self.document_id, + self.index, + &key, + value, + ) + } + + fn end(self) -> Result { + Ok(()) + } +} + +pub struct StructSerializer<'a> { + pub schema: &'a Schema, + pub document_id: DocumentId, + pub index: &'a RawIndex, +} + +impl<'a> ser::SerializeStruct for StructSerializer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T, + ) -> Result<(), Self::Error> + where T: ser::Serialize, + { + serialize_value( + self.schema, + self.document_id, + self.index, + key, + value, + ) + } + + fn end(self) -> Result { + Ok(()) + } +} + +fn serialize_value( + schema: &Schema, + document_id: DocumentId, + index: &RawIndex, + key: &str, + value: &T, +) -> Result<(), SerializerError> +where T: ser::Serialize, +{ + if let Some(attr) = schema.attribute(key) { + let props = schema.props(attr); + + if props.is_stored() { + let value = rmp_serde::to_vec_named(value)?; + index.set_document_attribute(document_id, attr, value)?; + } + } + + Ok(()) +} From abd7d1de48039f97a0c1d1d4f8c2e27de245b0c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Apr 2019 16:19:50 +0200 Subject: [PATCH 35/44] feat: Introduce the extract_document_id function --- meilidb-data/src/serde/extract_document_id.rs | 259 ++++++++++++++++++ meilidb-data/src/serde/mod.rs | 2 + meilidb-data/src/serde/serializer.rs | 6 +- 3 files changed, 262 insertions(+), 5 deletions(-) create mode 100644 meilidb-data/src/serde/extract_document_id.rs diff --git a/meilidb-data/src/serde/extract_document_id.rs b/meilidb-data/src/serde/extract_document_id.rs new file mode 100644 index 000000000..5310da538 --- /dev/null +++ b/meilidb-data/src/serde/extract_document_id.rs @@ -0,0 +1,259 @@ +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; + +use meilidb_core::DocumentId; +use serde::Serialize; +use serde::ser; + +use super::{SerializerError, ExtractString}; + +pub fn extract_document_id( + identifier: &str, + document: &D, +) -> Result, SerializerError> +where D: serde::Serialize, +{ + let serializer = ExtractDocumentId { identifier }; + document.serialize(serializer) +} + +fn calculate_hash(t: &T) -> u64 { + let mut s = DefaultHasher::new(); + t.hash(&mut s); + s.finish() +} + +struct ExtractDocumentId<'a> { + identifier: &'a str, +} + +impl<'a> ser::Serializer for ExtractDocumentId<'a> { + type Ok = Option; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ExtractDocumentIdMapSerializer<'a>; + type SerializeStruct = ExtractDocumentIdStructSerializer<'a>; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! { + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, value: &str) -> Result { + Err(SerializerError::UnserializableType { name: "str" }) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + let serializer = ExtractDocumentIdMapSerializer { + identifier: self.identifier, + document_id: None, + current_key_name: None, + }; + + Ok(serializer) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + let serializer = ExtractDocumentIdStructSerializer { + identifier: self.identifier, + document_id: None, + }; + + Ok(serializer) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} + +pub struct ExtractDocumentIdMapSerializer<'a> { + identifier: &'a str, + document_id: Option, + current_key_name: Option, +} + +impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { + type Ok = Option; + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = key.serialize(ExtractString)?; + self.current_key_name = Some(key); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = self.current_key_name.take().unwrap(); + self.serialize_entry(&key, value) + } + + fn serialize_entry( + &mut self, + key: &K, + value: &V + ) -> Result<(), Self::Error> + where K: Serialize, V: Serialize, + { + let key = key.serialize(ExtractString)?; + + if self.identifier == key { + // TODO is it possible to have multiple ids? + let id = bincode::serialize(value).unwrap(); + let hash = calculate_hash(&id); + self.document_id = Some(DocumentId(hash)); + } + + Ok(()) + } + + fn end(self) -> Result { + Ok(self.document_id) + } +} + +pub struct ExtractDocumentIdStructSerializer<'a> { + identifier: &'a str, + document_id: Option, +} + +impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> { + type Ok = Option; + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T + ) -> Result<(), Self::Error> + where T: Serialize, + { + if self.identifier == key { + // TODO can it be possible to have multiple ids? + let id = bincode::serialize(value).unwrap(); + let hash = calculate_hash(&id); + self.document_id = Some(DocumentId(hash)); + } + + Ok(()) + } + + fn end(self) -> Result { + Ok(self.document_id) + } +} diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs index 284c970cf..cf85e60be 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-data/src/serde/mod.rs @@ -11,10 +11,12 @@ macro_rules! forward_to_unserializable_type { mod deserializer; mod serializer; mod extract_string; +mod extract_document_id; pub use self::deserializer::Deserializer; pub use self::serializer::Serializer; pub use self::extract_string::ExtractString; +pub use self::extract_document_id::extract_document_id; use std::{fmt, error::Error}; use rmp_serde::encode::Error as RmpError; diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs index 7a5808cfd..9be35c2dc 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-data/src/serde/serializer.rs @@ -1,9 +1,5 @@ -use std::collections::{HashSet, HashMap}; -use std::fmt; -use std::error::Error; - use meilidb_core::DocumentId; -use serde::{de, ser}; +use serde::ser; use crate::schema::Schema; use crate::database::RawIndex; From 645bab774843b5bdd4394b188b5f9d5da81e7ff5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Apr 2019 18:03:32 +0200 Subject: [PATCH 36/44] feat: Index documents using the Serializer struct --- meilidb-core/src/query_builder.rs | 2 +- ...extract_string.rs => convert_to_string.rs} | 88 ++++--- meilidb-data/src/serde/extract_document_id.rs | 32 +-- meilidb-data/src/serde/indexer.rs | 233 ++++++++++++++++++ meilidb-data/src/serde/mod.rs | 25 +- meilidb-data/src/serde/serializer.rs | 56 +++-- 6 files changed, 363 insertions(+), 73 deletions(-) rename meilidb-data/src/serde/{extract_string.rs => convert_to_string.rs} (53%) create mode 100644 meilidb-data/src/serde/indexer.rs diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 4d2327871..ad7de9c15 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -94,7 +94,7 @@ where I: Deref, let stream = self.index.map.search(automaton); op_builder.push(stream); } - op_builder.union() + op_builder.r#union() }; let mut matches = Vec::new(); diff --git a/meilidb-data/src/serde/extract_string.rs b/meilidb-data/src/serde/convert_to_string.rs similarity index 53% rename from meilidb-data/src/serde/extract_string.rs rename to meilidb-data/src/serde/convert_to_string.rs index 9c3ef25e5..67e592e78 100644 --- a/meilidb-data/src/serde/extract_string.rs +++ b/meilidb-data/src/serde/convert_to_string.rs @@ -3,9 +3,9 @@ use serde::ser; use super::SerializerError; -pub struct ExtractString; +pub struct ConvertToString; -impl ser::Serializer for ExtractString { +impl ser::Serializer for ConvertToString { type Ok = String; type Error = SerializerError; type SerializeSeq = ser::Impossible; @@ -16,22 +16,52 @@ impl ser::Serializer for ExtractString { type SerializeStruct = ser::Impossible; type SerializeStructVariant = ser::Impossible; - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, + fn serialize_bool(self, value: bool) -> Result { + Err(SerializerError::UnserializableType { type_name: "boolean" }) + } - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, + fn serialize_char(self, value: char) -> Result { + Ok(value.to_string()) + } - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, + fn serialize_i8(self, value: i8) -> Result { + Ok(value.to_string()) + } - f32 => serialize_f32, - f64 => serialize_f64, + fn serialize_i16(self, value: i16) -> Result { + Ok(value.to_string()) + } + + fn serialize_i32(self, value: i32) -> Result { + Ok(value.to_string()) + } + + fn serialize_i64(self, value: i64) -> Result { + Ok(value.to_string()) + } + + fn serialize_u8(self, value: u8) -> Result { + Ok(value.to_string()) + } + + fn serialize_u16(self, value: u16) -> Result { + Ok(value.to_string()) + } + + fn serialize_u32(self, value: u32) -> Result { + Ok(value.to_string()) + } + + fn serialize_u64(self, value: u64) -> Result { + Ok(value.to_string()) + } + + fn serialize_f32(self, value: f32) -> Result { + Ok(value.to_string()) + } + + fn serialize_f64(self, value: f64) -> Result { + Ok(value.to_string()) } fn serialize_str(self, value: &str) -> Result { @@ -39,25 +69,25 @@ impl ser::Serializer for ExtractString { } fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_some(self, _value: &T) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) + Err(SerializerError::UnserializableType { type_name: "()" }) } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) + Err(SerializerError::UnserializableType { type_name: "unit struct" }) } fn serialize_unit_variant( @@ -67,7 +97,7 @@ impl ser::Serializer for ExtractString { _variant: &'static str ) -> Result { - Err(SerializerError::UnserializableType { name: "unit variant" }) + Err(SerializerError::UnserializableType { type_name: "unit variant" }) } fn serialize_newtype_struct( @@ -89,15 +119,15 @@ impl ser::Serializer for ExtractString { ) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { name: "newtype variant" }) + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) + Err(SerializerError::UnserializableType { type_name: "sequence" }) } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) + Err(SerializerError::UnserializableType { type_name: "tuple" }) } fn serialize_tuple_struct( @@ -106,7 +136,7 @@ impl ser::Serializer for ExtractString { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple struct" }) + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) } fn serialize_tuple_variant( @@ -117,11 +147,11 @@ impl ser::Serializer for ExtractString { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple variant" }) + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) + Err(SerializerError::UnserializableType { type_name: "map" }) } fn serialize_struct( @@ -130,7 +160,7 @@ impl ser::Serializer for ExtractString { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "struct" }) + Err(SerializerError::UnserializableType { type_name: "struct" }) } fn serialize_struct_variant( @@ -141,6 +171,6 @@ impl ser::Serializer for ExtractString { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "struct variant" }) + Err(SerializerError::UnserializableType { type_name: "struct variant" }) } } diff --git a/meilidb-data/src/serde/extract_document_id.rs b/meilidb-data/src/serde/extract_document_id.rs index 5310da538..d7c6bb195 100644 --- a/meilidb-data/src/serde/extract_document_id.rs +++ b/meilidb-data/src/serde/extract_document_id.rs @@ -5,7 +5,7 @@ use meilidb_core::DocumentId; use serde::Serialize; use serde::ser; -use super::{SerializerError, ExtractString}; +use super::{SerializerError, ConvertToString}; pub fn extract_document_id( identifier: &str, @@ -57,29 +57,29 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { } fn serialize_str(self, value: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) + Err(SerializerError::UnserializableType { type_name: "str" }) } fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_some(self, _value: &T) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) + Err(SerializerError::UnserializableType { type_name: "()" }) } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) + Err(SerializerError::UnserializableType { type_name: "unit struct" }) } fn serialize_unit_variant( @@ -89,7 +89,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _variant: &'static str ) -> Result { - Err(SerializerError::UnserializableType { name: "unit variant" }) + Err(SerializerError::UnserializableType { type_name: "unit variant" }) } fn serialize_newtype_struct( @@ -111,15 +111,15 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { ) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { name: "newtype variant" }) + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) + Err(SerializerError::UnserializableType { type_name: "sequence" }) } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) + Err(SerializerError::UnserializableType { type_name: "tuple" }) } fn serialize_tuple_struct( @@ -128,7 +128,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple struct" }) + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) } fn serialize_tuple_variant( @@ -139,7 +139,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple variant" }) + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { @@ -174,7 +174,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "struct variant" }) + Err(SerializerError::UnserializableType { type_name: "struct variant" }) } } @@ -191,7 +191,7 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> where T: Serialize, { - let key = key.serialize(ExtractString)?; + let key = key.serialize(ConvertToString)?; self.current_key_name = Some(key); Ok(()) } @@ -210,7 +210,7 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { ) -> Result<(), Self::Error> where K: Serialize, V: Serialize, { - let key = key.serialize(ExtractString)?; + let key = key.serialize(ConvertToString)?; if self.identifier == key { // TODO is it possible to have multiple ids? diff --git a/meilidb-data/src/serde/indexer.rs b/meilidb-data/src/serde/indexer.rs new file mode 100644 index 000000000..b8eb6c599 --- /dev/null +++ b/meilidb-data/src/serde/indexer.rs @@ -0,0 +1,233 @@ +use meilidb_core::DocumentId; +use serde::ser; +use serde::Serialize; + +use crate::database::RawIndex; +use crate::indexer::Indexer as RawIndexer; +use crate::schema::SchemaAttr; +use super::{SerializerError, ConvertToString}; + +pub struct Indexer<'a> { + pub attribute: SchemaAttr, + pub indexer: &'a mut RawIndexer, + pub document_id: DocumentId, +} + +impl<'a> ser::Serializer for Indexer<'a> { + type Ok = (); + type Error = SerializerError; + type SerializeSeq = SeqIndexer<'a>; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + fn serialize_bool(self, value: bool) -> Result { + Err(SerializerError::UnindexableType { type_name: "boolean" }) + } + + fn serialize_char(self, value: char) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i8(self, value: i8) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i16(self, value: i16) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i32(self, value: i32) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i64(self, value: i64) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u8(self, value: u8) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u16(self, value: u16) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u32(self, value: u32) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u64(self, value: u64) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_f32(self, value: f32) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_f64(self, value: f64) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_str(self, text: &str) -> Result { + self.indexer.index_text(self.document_id, self.attribute, text); + Ok(()) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnindexableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnindexableType { type_name: "Option" }) + } + + fn serialize_some(self, value: &T) -> Result + where T: ser::Serialize, + { + let text = value.serialize(ConvertToString)?; + self.indexer.index_text(self.document_id, self.attribute, &text); + Ok(()) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnindexableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnindexableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: ser::Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: ser::Serialize, + { + Err(SerializerError::UnindexableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + let indexer = SeqIndexer { + attribute: self.attribute, + document_id: self.document_id, + indexer: self.indexer, + texts: Vec::new(), + }; + + Ok(indexer) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnindexableType { type_name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnindexableType { type_name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "struct variant" }) + } +} + +pub struct SeqIndexer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeSeq for SeqIndexer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where T: ser::Serialize + { + let text = value.serialize(ConvertToString)?; + self.texts.push(text); + + Ok(()) + } + + fn end(mut self) -> Result { + let texts = self.texts.iter().map(String::as_str); + self.indexer.index_text_seq(self.document_id, self.attribute, texts); + + Ok(()) + } +} diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs index cf85e60be..94e3172dd 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-data/src/serde/mod.rs @@ -2,21 +2,23 @@ macro_rules! forward_to_unserializable_type { ($($ty:ident => $se_method:ident,)*) => { $( fn $se_method(self, _v: $ty) -> Result { - Err(SerializerError::UnserializableType { name: "$ty" }) + Err(SerializerError::UnserializableType { type_name: "$ty" }) } )* } } mod deserializer; -mod serializer; -mod extract_string; mod extract_document_id; +mod convert_to_string; +mod indexer; +mod serializer; pub use self::deserializer::Deserializer; -pub use self::serializer::Serializer; -pub use self::extract_string::ExtractString; pub use self::extract_document_id::extract_document_id; +pub use self::convert_to_string::ConvertToString; +pub use self::indexer::Indexer; +pub use self::serializer::Serializer; use std::{fmt, error::Error}; use rmp_serde::encode::Error as RmpError; @@ -27,7 +29,8 @@ pub enum SerializerError { DocumentIdNotFound, RmpError(RmpError), SledError(sled::Error), - UnserializableType { name: &'static str }, + UnserializableType { type_name: &'static str }, + UnindexableType { type_name: &'static str }, Custom(String), } @@ -45,11 +48,13 @@ impl fmt::Display for SerializerError { } SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e), SerializerError::SledError(e) => write!(f, "sled related error: {}", e), - SerializerError::UnserializableType { name } => { - write!(f, "Only struct and map types are considered valid documents and - can be serialized, not {} types directly.", name) + SerializerError::UnserializableType { type_name } => { + write!(f, "{} are not a serializable type", type_name) }, - SerializerError::Custom(s) => f.write_str(&s), + SerializerError::UnindexableType { type_name } => { + write!(f, "{} are not an indexable type", type_name) + }, + SerializerError::Custom(s) => f.write_str(s), } } } diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs index 9be35c2dc..34adaa326 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-data/src/serde/serializer.rs @@ -1,13 +1,15 @@ use meilidb_core::DocumentId; use serde::ser; -use crate::schema::Schema; use crate::database::RawIndex; -use super::{SerializerError, ExtractString}; +use crate::indexer::Indexer as RawIndexer; +use crate::schema::{Schema, SchemaAttr}; +use super::{SerializerError, ConvertToString, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, pub index: &'a RawIndex, + pub indexer: &'a mut RawIndexer, pub document_id: DocumentId, } @@ -41,29 +43,29 @@ impl<'a> ser::Serializer for Serializer<'a> { } fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) + Err(SerializerError::UnserializableType { type_name: "str" }) } fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_some(self, _value: &T) -> Result where T: ser::Serialize, { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) + Err(SerializerError::UnserializableType { type_name: "()" }) } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) + Err(SerializerError::UnserializableType { type_name: "unit struct" }) } fn serialize_unit_variant( @@ -73,7 +75,7 @@ impl<'a> ser::Serializer for Serializer<'a> { _variant: &'static str ) -> Result { - Err(SerializerError::UnserializableType { name: "unit variant" }) + Err(SerializerError::UnserializableType { type_name: "unit variant" }) } fn serialize_newtype_struct( @@ -95,15 +97,15 @@ impl<'a> ser::Serializer for Serializer<'a> { ) -> Result where T: ser::Serialize, { - Err(SerializerError::UnserializableType { name: "newtype variant" }) + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) + Err(SerializerError::UnserializableType { type_name: "sequence" }) } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) + Err(SerializerError::UnserializableType { type_name: "tuple" }) } fn serialize_tuple_struct( @@ -112,7 +114,7 @@ impl<'a> ser::Serializer for Serializer<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple struct" }) + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) } fn serialize_tuple_variant( @@ -123,7 +125,7 @@ impl<'a> ser::Serializer for Serializer<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple variant" }) + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { @@ -131,6 +133,7 @@ impl<'a> ser::Serializer for Serializer<'a> { schema: self.schema, document_id: self.document_id, index: self.index, + indexer: self.indexer, current_key_name: None, }) } @@ -145,6 +148,7 @@ impl<'a> ser::Serializer for Serializer<'a> { schema: self.schema, document_id: self.document_id, index: self.index, + indexer: self.indexer, }) } @@ -156,7 +160,7 @@ impl<'a> ser::Serializer for Serializer<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "struct variant" }) + Err(SerializerError::UnserializableType { type_name: "struct variant" }) } } @@ -164,6 +168,7 @@ pub struct MapSerializer<'a> { pub schema: &'a Schema, pub document_id: DocumentId, pub index: &'a RawIndex, + pub indexer: &'a mut RawIndexer, pub current_key_name: Option, } @@ -174,7 +179,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> where T: ser::Serialize, { - let key = key.serialize(ExtractString)?; + let key = key.serialize(ConvertToString)?; self.current_key_name = Some(key); Ok(()) } @@ -193,12 +198,13 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { ) -> Result<(), Self::Error> where K: ser::Serialize, V: ser::Serialize, { - let key = key.serialize(ExtractString)?; + let key = key.serialize(ConvertToString)?; serialize_value( self.schema, self.document_id, self.index, + self.indexer, &key, value, ) @@ -213,6 +219,7 @@ pub struct StructSerializer<'a> { pub schema: &'a Schema, pub document_id: DocumentId, pub index: &'a RawIndex, + pub indexer: &'a mut RawIndexer, } impl<'a> ser::SerializeStruct for StructSerializer<'a> { @@ -230,6 +237,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { self.schema, self.document_id, self.index, + self.indexer, key, value, ) @@ -244,6 +252,7 @@ fn serialize_value( schema: &Schema, document_id: DocumentId, index: &RawIndex, + indexer: &mut RawIndexer, key: &str, value: &T, ) -> Result<(), SerializerError> @@ -256,6 +265,19 @@ where T: ser::Serialize, let value = rmp_serde::to_vec_named(value)?; index.set_document_attribute(document_id, attr, value)?; } + + if props.is_indexed() { + let indexer = Indexer { + attribute: attr, + indexer: indexer, + document_id: document_id, + }; + value.serialize(indexer)?; + } + + if props.is_ranked() { + unimplemented!() + } } Ok(()) From ad24ef8a256ffb2e75e57cd3030f73e46f291892 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 21 Apr 2019 21:58:54 +0200 Subject: [PATCH 37/44] feat: Index words of structs, maps and tuples --- meilidb-data/src/serde/indexer.rs | 118 +++++++++++++++++++++++++-- meilidb-data/src/serde/serializer.rs | 18 ++-- 2 files changed, 120 insertions(+), 16 deletions(-) diff --git a/meilidb-data/src/serde/indexer.rs b/meilidb-data/src/serde/indexer.rs index b8eb6c599..c6e0d0c75 100644 --- a/meilidb-data/src/serde/indexer.rs +++ b/meilidb-data/src/serde/indexer.rs @@ -17,11 +17,11 @@ impl<'a> ser::Serializer for Indexer<'a> { type Ok = (); type Error = SerializerError; type SerializeSeq = SeqIndexer<'a>; - type SerializeTuple = ser::Impossible; + type SerializeTuple = TupleIndexer<'a>; type SerializeTupleStruct = ser::Impossible; type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; + type SerializeMap = MapIndexer<'a>; + type SerializeStruct = StructSerializer<'a>; type SerializeStructVariant = ser::Impossible; fn serialize_bool(self, value: bool) -> Result { @@ -156,7 +156,14 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnindexableType { type_name: "tuple" }) + let indexer = TupleIndexer { + attribute: self.attribute, + document_id: self.document_id, + indexer: self.indexer, + texts: Vec::new(), + }; + + Ok(indexer) } fn serialize_tuple_struct( @@ -180,7 +187,14 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnindexableType { type_name: "map" }) + let indexer = MapIndexer { + attribute: self.attribute, + document_id: self.document_id, + indexer: self.indexer, + texts: Vec::new(), + }; + + Ok(indexer) } fn serialize_struct( @@ -220,14 +234,104 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> { { let text = value.serialize(ConvertToString)?; self.texts.push(text); - Ok(()) } fn end(mut self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - + Ok(()) + } +} + +pub struct MapIndexer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeMap for MapIndexer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let text = key.serialize(ConvertToString)?; + self.texts.push(text); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let text = value.serialize(ConvertToString)?; + self.texts.push(text); + Ok(()) + } + + fn end(self) -> Result { + let texts = self.texts.iter().map(String::as_str); + self.indexer.index_text_seq(self.document_id, self.attribute, texts); + Ok(()) + } +} + +pub struct StructSerializer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeStruct for StructSerializer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T, + ) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let key_text = key.to_owned(); + let value_text = value.serialize(ConvertToString)?; + self.texts.push(key_text); + self.texts.push(value_text); + Ok(()) + } + + fn end(self) -> Result { + let texts = self.texts.iter().map(String::as_str); + self.indexer.index_text_seq(self.document_id, self.attribute, texts); + Ok(()) + } +} + +pub struct TupleIndexer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeTuple for TupleIndexer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where T: Serialize + { + let text = value.serialize(ConvertToString)?; + self.texts.push(text); + Ok(()) + } + + fn end(self) -> Result { + let texts = self.texts.iter().map(String::as_str); + self.indexer.index_text_seq(self.document_id, self.attribute, texts); Ok(()) } } diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs index 34adaa326..d71c87f14 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-data/src/serde/serializer.rs @@ -165,11 +165,11 @@ impl<'a> ser::Serializer for Serializer<'a> { } pub struct MapSerializer<'a> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub index: &'a RawIndex, - pub indexer: &'a mut RawIndexer, - pub current_key_name: Option, + schema: &'a Schema, + document_id: DocumentId, + index: &'a RawIndex, + indexer: &'a mut RawIndexer, + current_key_name: Option, } impl<'a> ser::SerializeMap for MapSerializer<'a> { @@ -216,10 +216,10 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { } pub struct StructSerializer<'a> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub index: &'a RawIndex, - pub indexer: &'a mut RawIndexer, + schema: &'a Schema, + document_id: DocumentId, + index: &'a RawIndex, + indexer: &'a mut RawIndexer, } impl<'a> ser::SerializeStruct for StructSerializer<'a> { From ed6b6038ee66f51b3952d7e543653385699324a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 21 Apr 2019 22:40:21 +0200 Subject: [PATCH 38/44] feat: Finalize index merging on document insertion --- meilidb-data/src/database.rs | 39 +++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index ef67227bd..0e71e0f5b 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -18,7 +18,7 @@ use serde::de; use sled::IVec; use crate::{Schema, SchemaAttr, RankedMap}; -use crate::serde::Deserializer; +use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError}; use crate::indexer::Indexer; #[derive(Debug)] @@ -26,8 +26,10 @@ pub enum Error { SchemaDiffer, SchemaMissing, WordIndexMissing, + MissingDocumentId, SledError(sled::Error), BincodeError(bincode::Error), + SerializerError(SerializerError), } impl From for Error { @@ -42,6 +44,12 @@ impl From for Error { } } +impl From for Error { + fn from(error: SerializerError) -> Error { + Error::SerializerError(error) + } +} + fn index_name(name: &str) -> Vec { format!("index-{}", name).into_bytes() } @@ -372,11 +380,36 @@ impl DocumentsAddition { pub fn update_document(&mut self, document: D) -> Result<(), Error> where D: serde::Serialize, { - unimplemented!() + let schema = self.inner.schema(); + let identifier = schema.identifier_name(); + + let document_id = match extract_document_id(identifier, &document)? { + Some(id) => id, + None => return Err(Error::MissingDocumentId), + }; + + let serializer = Serializer { + schema, + index: &self.inner, + indexer: &mut self.indexer, + document_id, + }; + + document.serialize(serializer)?; + + Ok(()) } pub fn finalize(self) -> sled::Result<()> { - unimplemented!() + let delta_index = self.indexer.build(); + + let index = self.inner.word_index(); + let new_index = index.r#union(&delta_index); + + let new_index = Arc::from(new_index); + self.inner.update_word_index(new_index)?; + + Ok(()) } } From 7dbf5d63199832b14a80244699c75838ae321775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 22 Apr 2019 15:26:43 +0200 Subject: [PATCH 39/44] fix: Make the examples build --- meilidb-data/Cargo.toml | 2 +- meilidb-data/src/database.rs | 28 +++++++++++++------- meilidb-data/src/serde/indexer.rs | 2 +- meilidb/Cargo.toml | 7 ++--- meilidb/examples/create-database.rs | 14 +++++----- meilidb/examples/query-database.rs | 40 ++++++++++++++--------------- 6 files changed, 51 insertions(+), 42 deletions(-) diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index e6fca8c66..e2744a962 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -16,7 +16,7 @@ ordered-float = { version = "1.0.2", features = ["serde"] } sdset = "0.3.1" serde = { version = "1.0.90", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } -sled = "0.22.1" +sled = "0.23.0" toml = { version = "0.5.0", features = ["preserve_order"] } [dependencies.rmp-serde] diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 0e71e0f5b..357693ad8 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -3,6 +3,7 @@ use std::io::{self, Cursor, BufRead}; use std::iter::FromIterator; use std::path::Path; use std::sync::Arc; +use std::{error, fmt}; use arc_swap::{ArcSwap, Lease}; use byteorder::{ReadBytesExt, BigEndian}; @@ -50,6 +51,23 @@ impl From for Error { } } +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + SchemaDiffer => write!(f, "schemas differ"), + SchemaMissing => write!(f, "this index does not have a schema"), + WordIndexMissing => write!(f, "this index does not have a word index"), + MissingDocumentId => write!(f, "document id is missing"), + SledError(e) => write!(f, "sled error; {}", e), + BincodeError(e) => write!(f, "bincode error; {}", e), + SerializerError(e) => write!(f, "serializer error; {}", e), + } + } +} + +impl error::Error for Error { } + fn index_name(name: &str) -> Vec { format!("index-{}", name).into_bytes() } @@ -96,13 +114,6 @@ fn extract_document_key(key: Vec) -> io::Result<(DocumentId, SchemaAttr)> { Ok((document_id, schema_attr)) } -fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> { - match ivec { - IVec::Inline(len, bytes) => Arc::from(&bytes[..len as usize]), - IVec::Remote { buf } => buf, - } -} - #[derive(Clone)] pub struct Database { opened: Arc>>, @@ -185,7 +196,7 @@ impl RawIndex { let bytes = bytes.ok_or(Error::WordIndexMissing)?; let word_index = { let len = bytes.len(); - let bytes = ivec_into_arc(bytes); + let bytes: Arc<[u8]> = Into::into(bytes); let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len); // TODO must handle this error @@ -399,7 +410,6 @@ impl DocumentsAddition { Ok(()) } - pub fn finalize(self) -> sled::Result<()> { let delta_index = self.indexer.build(); diff --git a/meilidb-data/src/serde/indexer.rs b/meilidb-data/src/serde/indexer.rs index c6e0d0c75..8eb0b2c67 100644 --- a/meilidb-data/src/serde/indexer.rs +++ b/meilidb-data/src/serde/indexer.rs @@ -237,7 +237,7 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> { Ok(()) } - fn end(mut self) -> Result { + fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); Ok(()) diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml index e8cdb8d56..c2f4ad0fc 100644 --- a/meilidb/Cargo.toml +++ b/meilidb/Cargo.toml @@ -15,12 +15,13 @@ i128 = ["meilidb-core/i128"] nightly = ["meilidb-core/nightly"] [dev-dependencies] -csv = "1.0.5" -env_logger = "0.6.0" +csv = "1.0.7" +env_logger = "0.6.1" jemallocator = "0.1.9" quickcheck = "0.8.2" rand = "0.6.5" rand_xorshift = "0.1.1" -structopt = "0.2.14" +serde = { version = "1.0.90", features = ["derive"] } +structopt = "0.2.15" tempfile = "3.0.7" termcolor = "1.0.4" diff --git a/meilidb/examples/create-database.rs b/meilidb/examples/create-database.rs index e5d9c403a..b0bfa1127 100644 --- a/meilidb/examples/create-database.rs +++ b/meilidb/examples/create-database.rs @@ -9,10 +9,10 @@ use std::error::Error; use std::borrow::Cow; use std::fs::File; -use serde_derive::{Serialize, Deserialize}; +use serde::{Serialize, Deserialize}; use structopt::StructOpt; -use meilidb::database::{Database, Schema}; +use meilidb_data::{Database, Schema}; #[derive(Debug, StructOpt)] pub struct Opt { @@ -50,9 +50,9 @@ fn index( stop_words: &HashSet, ) -> Result> { - let database = Database::create(database_path)?; + let database = Database::start_default(database_path)?; - database.create_index("default", &schema)?; + let index = database.create_index("default".to_string(), schema.clone())?; let mut rdr = csv::Reader::from_path(csv_data_path)?; let mut raw_record = csv::StringRecord::new(); @@ -62,7 +62,7 @@ fn index( let mut end_of_file = false; while !end_of_file { - let mut update = database.start_update("default")?; + let mut update = index.documents_addition(); loop { end_of_file = !rdr.read_record(&mut raw_record)?; @@ -76,7 +76,7 @@ fn index( } }; - update.update_document(&document, &stop_words)?; + update.update_document(&document)?; print!("\rindexing document {}", i); i += 1; @@ -89,7 +89,7 @@ fn index( println!(); println!("committing update..."); - database.commit_update(update)?; + update.finalize()?; } Ok(database) diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs index 2689ffe0f..f9a3bf8e5 100644 --- a/meilidb/examples/query-database.rs +++ b/meilidb/examples/query-database.rs @@ -2,19 +2,19 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; use std::collections::btree_map::{BTreeMap, Entry}; +use std::collections::{HashMap, HashSet}; use std::iter::FromIterator; use std::io::{self, Write}; use std::time::Instant; use std::path::PathBuf; use std::error::Error; -use hashbrown::{HashMap, HashSet}; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use structopt::StructOpt; use meilidb_core::Match; -use meilidb::database::schema::SchemaAttr; -use meilidb::database::Database; +use meilidb_data::schema::SchemaAttr; +use meilidb_data::Database; #[derive(Debug, StructOpt)] pub struct Opt { @@ -138,12 +138,19 @@ fn main() -> Result<(), Box> { let opt = Opt::from_args(); let start = Instant::now(); - let database = Database::open(&opt.database_path)?; - println!("database prepared for you in {:.2?}", start.elapsed()); + let database = Database::start_default(&opt.database_path)?; let mut buffer = String::new(); let input = io::stdin(); + let index = database.open_index("default")?.unwrap(); + let schema = index.schema(); + + println!("database prepared for you in {:.2?}", start.elapsed()); + + let fields = opt.displayed_fields.iter().map(String::as_str); + let fields = HashSet::from_iter(fields); + loop { print!("Searching for: "); io::stdout().flush()?; @@ -151,12 +158,9 @@ fn main() -> Result<(), Box> { if input.read_line(&mut buffer)? == 0 { break } let query = buffer.trim_end_matches('\n'); - let view = database.view("default")?; - let schema = view.schema(); - let start = Instant::now(); - let builder = view.query_builder(); + let builder = index.query_builder(); let documents = builder.query(query, 0..opt.number_results); let number_of_documents = documents.len(); @@ -164,19 +168,12 @@ fn main() -> Result<(), Box> { doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index)); - match view.document_by_id::(doc.id) { - Ok(document) => { - for name in &opt.displayed_fields { - let attr = match schema.attribute(name) { - Some(attr) => attr, - None => continue, - }; - let text = match document.get(name) { - Some(text) => text, - None => continue, - }; - + match index.document::(Some(&fields), doc.id) { + Ok(Some(document)) => { + for (name, text) in document { print!("{}: ", name); + + let attr = schema.attribute(&name).unwrap(); let matches = doc.matches.iter() .filter(|m| SchemaAttr::new(m.attribute) == attr) .cloned(); @@ -186,6 +183,7 @@ fn main() -> Result<(), Box> { println!(); } }, + Ok(None) => eprintln!("missing document"), Err(e) => eprintln!("{}", e), } From f0268d49fe3d84832b6408ccaeb6b3e142b683c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 22 Apr 2019 18:43:00 +0200 Subject: [PATCH 40/44] fix: Always lowercase indexed tokens --- meilidb-data/src/indexer.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index a1be35a93..350cc9e00 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -33,6 +33,10 @@ impl Indexer { pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { for token in Tokenizer::new(text) { if token.word_index >= self.word_limit { break } + + let lower = token.word.to_lowercase(); + let token = Token { word: &lower, ..token }; + let docindex = match token_to_docindex(id, attr, token) { Some(docindex) => docindex, None => break, @@ -49,6 +53,10 @@ impl Indexer { let iter = iter.into_iter(); for token in SeqTokenizer::new(iter) { if token.word_index >= self.word_limit { break } + + let lower = token.word.to_lowercase(); + let token = Token { word: &lower, ..token }; + let docindex = match token_to_docindex(id, attr, token) { Some(docindex) => docindex, None => break, From 7035f7607774987044870f96561062bca97abda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 23 Apr 2019 17:12:21 +0200 Subject: [PATCH 41/44] squash-me: Make better measurements of the retrieving spent time --- meilidb/examples/query-database.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/meilidb/examples/query-database.rs b/meilidb/examples/query-database.rs index f9a3bf8e5..6b048cc5b 100644 --- a/meilidb/examples/query-database.rs +++ b/meilidb/examples/query-database.rs @@ -5,7 +5,7 @@ use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::{HashMap, HashSet}; use std::iter::FromIterator; use std::io::{self, Write}; -use std::time::Instant; +use std::time::{Instant, Duration}; use std::path::PathBuf; use std::error::Error; @@ -158,17 +158,23 @@ fn main() -> Result<(), Box> { if input.read_line(&mut buffer)? == 0 { break } let query = buffer.trim_end_matches('\n'); - let start = Instant::now(); + let start_total = Instant::now(); let builder = index.query_builder(); let documents = builder.query(query, 0..opt.number_results); + let mut retrieve_duration = Duration::default(); + let number_of_documents = documents.len(); for mut doc in documents { doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index)); - match index.document::(Some(&fields), doc.id) { + let start_retrieve = Instant::now(); + let result = index.document::(Some(&fields), doc.id); + retrieve_duration += start_retrieve.elapsed(); + + match result { Ok(Some(document)) => { for (name, text) in document { print!("{}: ", name); @@ -200,7 +206,8 @@ fn main() -> Result<(), Box> { println!(); } - eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed()); + eprintln!("document field retrieve took {:.2?}", retrieve_duration); + eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed()); buffer.clear(); } From 068f1bc2025227dc418860ebeafc99f585b8fcf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Apr 2019 21:06:00 +0200 Subject: [PATCH 42/44] feat: Index unidecoded words --- meilidb-data/Cargo.toml | 1 + meilidb-data/src/indexer.rs | 74 +++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml index e2744a962..6096e4ad3 100644 --- a/meilidb-data/Cargo.toml +++ b/meilidb-data/Cargo.toml @@ -18,6 +18,7 @@ serde = { version = "1.0.90", features = ["derive"] } serde_json = { version = "1.0.39", features = ["preserve_order"] } sled = "0.23.0" toml = { version = "0.5.0", features = ["preserve_order"] } +deunicode = "1.0.0" [dependencies.rmp-serde] git = "https://github.com/3Hren/msgpack-rust.git" diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs index 350cc9e00..3cfd8f722 100644 --- a/meilidb-data/src/indexer.rs +++ b/meilidb-data/src/indexer.rs @@ -1,13 +1,14 @@ use std::collections::BTreeMap; use std::convert::TryFrom; +use deunicode::deunicode_with_tofu; use meilidb_core::{DocumentId, DocIndex}; use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder}; -use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token}; -use crate::SchemaAttr; - +use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; use sdset::Set; +use crate::SchemaAttr; + type Word = Vec; // TODO make it be a SmallVec pub struct Indexer { @@ -32,18 +33,8 @@ impl Indexer { pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { for token in Tokenizer::new(text) { - if token.word_index >= self.word_limit { break } - - let lower = token.word.to_lowercase(); - let token = Token { word: &lower, ..token }; - - let docindex = match token_to_docindex(id, attr, token) { - Some(docindex) => docindex, - None => break, - }; - - let word = Vec::from(token.word); - self.indexed.entry(word).or_insert_with(Vec::new).push(docindex); + let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed); + if !must_continue { break } } } @@ -52,18 +43,8 @@ impl Indexer { { let iter = iter.into_iter(); for token in SeqTokenizer::new(iter) { - if token.word_index >= self.word_limit { break } - - let lower = token.word.to_lowercase(); - let token = Token { word: &lower, ..token }; - - let docindex = match token_to_docindex(id, attr, token) { - Some(docindex) => docindex, - None => break, - }; - - let word = Vec::from(token.word); - self.indexed.entry(word).or_insert_with(Vec::new).push(docindex); + let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed); + if !must_continue { break } } } @@ -82,7 +63,44 @@ impl Indexer { } } -fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option { +fn index_token( + token: Token, + id: DocumentId, + attr: SchemaAttr, + word_limit: usize, + indexed: &mut BTreeMap>, +) -> bool +{ + if token.word_index >= word_limit { return false } + + let lower = token.word.to_lowercase(); + let token = Token { word: &lower, ..token }; + match token_to_docindex(id, attr, token) { + Some(docindex) => { + let word = Vec::from(token.word); + indexed.entry(word).or_insert_with(Vec::new).push(docindex); + }, + None => return false, + } + + if !lower.contains(is_cjk) { + let unidecoded = deunicode_with_tofu(&lower, ""); + if unidecoded != lower { + let token = Token { word: &unidecoded, ..token }; + match token_to_docindex(id, attr, token) { + Some(docindex) => { + let word = Vec::from(token.word); + indexed.entry(word).or_insert_with(Vec::new).push(docindex); + }, + None => return false, + } + } + } + + true +} + +fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option { let word_index = u16::try_from(token.word_index).ok()?; let char_index = u16::try_from(token.char_index).ok()?; let char_length = u16::try_from(token.word.chars().count()).ok()?; From 05476712462272c280f646c313f67290c56db918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 26 Apr 2019 12:18:39 +0200 Subject: [PATCH 43/44] feat: Take ranked attributes into account --- meilidb-data/src/database.rs | 22 ++- meilidb-data/src/number.rs | 2 +- meilidb-data/src/serde/convert_to_number.rs | 180 ++++++++++++++++++++ meilidb-data/src/serde/mod.rs | 15 +- meilidb-data/src/serde/serializer.rs | 15 +- 5 files changed, 224 insertions(+), 10 deletions(-) create mode 100644 meilidb-data/src/serde/convert_to_number.rs diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs index 357693ad8..da43dd4bc 100644 --- a/meilidb-data/src/database.rs +++ b/meilidb-data/src/database.rs @@ -251,8 +251,12 @@ impl RawIndex { Ok(()) } - pub fn update_ranked_map(&self, ranked_map: Arc) { - self.ranked_map.store(ranked_map) + pub fn update_ranked_map(&self, ranked_map: Arc) -> sled::Result<()> { + let data = bincode::serialize(ranked_map.as_ref()).unwrap(); + self.inner.set("ranked-map", data).map(drop)?; + self.ranked_map.store(ranked_map); + + Ok(()) } pub fn set_document_attribute( @@ -343,7 +347,8 @@ impl Index { pub fn documents_addition(&self) -> DocumentsAddition { let index = self.0.clone(); - DocumentsAddition::from_raw(index) + let ranked_map = self.0.ranked_map().clone(); + DocumentsAddition::from_raw(index, ranked_map) } pub fn documents_deletion(&self) -> DocumentsDeletion { @@ -381,11 +386,12 @@ impl Index { pub struct DocumentsAddition { inner: RawIndex, indexer: Indexer, + ranked_map: RankedMap, } impl DocumentsAddition { - pub fn from_raw(inner: RawIndex) -> DocumentsAddition { - DocumentsAddition { inner, indexer: Indexer::new() } + pub fn from_raw(inner: RawIndex, ranked_map: RankedMap) -> DocumentsAddition { + DocumentsAddition { inner, indexer: Indexer::new(), ranked_map } } pub fn update_document(&mut self, document: D) -> Result<(), Error> @@ -403,6 +409,7 @@ impl DocumentsAddition { schema, index: &self.inner, indexer: &mut self.indexer, + ranked_map: &mut self.ranked_map, document_id, }; @@ -430,7 +437,10 @@ pub struct DocumentsDeletion { impl DocumentsDeletion { pub fn from_raw(inner: RawIndex) -> DocumentsDeletion { - DocumentsDeletion { inner, documents: Vec::new() } + DocumentsDeletion { + inner, + documents: Vec::new(), + } } pub fn delete_document(&mut self, id: DocumentId) { diff --git a/meilidb-data/src/number.rs b/meilidb-data/src/number.rs index 9a2d0ea24..5e64cc78f 100644 --- a/meilidb-data/src/number.rs +++ b/meilidb-data/src/number.rs @@ -36,7 +36,7 @@ impl FromStr for Number { } } -#[derive(Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ParseNumberError { uint_error: ParseIntError, int_error: ParseIntError, diff --git a/meilidb-data/src/serde/convert_to_number.rs b/meilidb-data/src/serde/convert_to_number.rs new file mode 100644 index 000000000..57223a6c1 --- /dev/null +++ b/meilidb-data/src/serde/convert_to_number.rs @@ -0,0 +1,180 @@ +use std::str::FromStr; + +use ordered_float::OrderedFloat; +use serde::ser; +use serde::Serialize; + +use super::SerializerError; +use crate::Number; + +pub struct ConvertToNumber; + +impl ser::Serializer for ConvertToNumber { + type Ok = Number; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + fn serialize_bool(self, value: bool) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_char(self, value: char) -> Result { + unimplemented!() + } + + fn serialize_i8(self, value: i8) -> Result { + Ok(Number::Signed(i64::from(value))) + } + + fn serialize_i16(self, value: i16) -> Result { + Ok(Number::Signed(i64::from(value))) + } + + fn serialize_i32(self, value: i32) -> Result { + Ok(Number::Signed(i64::from(value))) + } + + fn serialize_i64(self, value: i64) -> Result { + Ok(Number::Signed(value)) + } + + fn serialize_u8(self, value: u8) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_u16(self, value: u16) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_u32(self, value: u32) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_u64(self, value: u64) -> Result { + Ok(Number::Unsigned(value)) + } + + fn serialize_f32(self, value: f32) -> Result { + Ok(Number::Float(OrderedFloat(value as f64))) + } + + fn serialize_f64(self, value: f64) -> Result { + Ok(Number::Float(OrderedFloat(value))) + } + + fn serialize_str(self, value: &str) -> Result { + Ok(Number::from_str(value)?) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { type_name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { type_name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { type_name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "struct variant" }) + } +} diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs index 94e3172dd..75209c574 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-data/src/serde/mod.rs @@ -8,27 +8,31 @@ macro_rules! forward_to_unserializable_type { } } +mod convert_to_number; +mod convert_to_string; mod deserializer; mod extract_document_id; -mod convert_to_string; mod indexer; mod serializer; pub use self::deserializer::Deserializer; pub use self::extract_document_id::extract_document_id; pub use self::convert_to_string::ConvertToString; +pub use self::convert_to_number::ConvertToNumber; pub use self::indexer::Indexer; pub use self::serializer::Serializer; use std::{fmt, error::Error}; use rmp_serde::encode::Error as RmpError; use serde::ser; +use crate::number::ParseNumberError; #[derive(Debug)] pub enum SerializerError { DocumentIdNotFound, RmpError(RmpError), SledError(sled::Error), + ParseNumberError(ParseNumberError), UnserializableType { type_name: &'static str }, UnindexableType { type_name: &'static str }, Custom(String), @@ -48,6 +52,9 @@ impl fmt::Display for SerializerError { } SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e), SerializerError::SledError(e) => write!(f, "sled related error: {}", e), + SerializerError::ParseNumberError(e) => { + write!(f, "error while trying to parse a number: {}", e) + }, SerializerError::UnserializableType { type_name } => { write!(f, "{} are not a serializable type", type_name) }, @@ -78,3 +85,9 @@ impl From for SerializerError { SerializerError::SledError(error) } } + +impl From for SerializerError { + fn from(error: ParseNumberError) -> SerializerError { + SerializerError::ParseNumberError(error) + } +} diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs index d71c87f14..37b3c7036 100644 --- a/meilidb-data/src/serde/serializer.rs +++ b/meilidb-data/src/serde/serializer.rs @@ -2,14 +2,16 @@ use meilidb_core::DocumentId; use serde::ser; use crate::database::RawIndex; +use crate::ranked_map::RankedMap; use crate::indexer::Indexer as RawIndexer; use crate::schema::{Schema, SchemaAttr}; -use super::{SerializerError, ConvertToString, Indexer}; +use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, pub index: &'a RawIndex, pub indexer: &'a mut RawIndexer, + pub ranked_map: &'a mut RankedMap, pub document_id: DocumentId, } @@ -134,6 +136,7 @@ impl<'a> ser::Serializer for Serializer<'a> { document_id: self.document_id, index: self.index, indexer: self.indexer, + ranked_map: self.ranked_map, current_key_name: None, }) } @@ -149,6 +152,7 @@ impl<'a> ser::Serializer for Serializer<'a> { document_id: self.document_id, index: self.index, indexer: self.indexer, + ranked_map: self.ranked_map, }) } @@ -169,6 +173,7 @@ pub struct MapSerializer<'a> { document_id: DocumentId, index: &'a RawIndex, indexer: &'a mut RawIndexer, + ranked_map: &'a mut RankedMap, current_key_name: Option, } @@ -205,6 +210,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { self.document_id, self.index, self.indexer, + self.ranked_map, &key, value, ) @@ -220,6 +226,7 @@ pub struct StructSerializer<'a> { document_id: DocumentId, index: &'a RawIndex, indexer: &'a mut RawIndexer, + ranked_map: &'a mut RankedMap, } impl<'a> ser::SerializeStruct for StructSerializer<'a> { @@ -238,6 +245,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { self.document_id, self.index, self.indexer, + self.ranked_map, key, value, ) @@ -253,6 +261,7 @@ fn serialize_value( document_id: DocumentId, index: &RawIndex, indexer: &mut RawIndexer, + ranked_map: &mut RankedMap, key: &str, value: &T, ) -> Result<(), SerializerError> @@ -276,7 +285,9 @@ where T: ser::Serialize, } if props.is_ranked() { - unimplemented!() + let key = (document_id, attr); + let number = value.serialize(ConvertToNumber)?; + ranked_map.insert(key, number); } } From 9023a12ad41a1d166bba74c279bc7f3875abad73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 26 Apr 2019 14:59:35 +0200 Subject: [PATCH 44/44] feat: Introduce the unrankable error variant --- meilidb-data/src/serde/convert_to_number.rs | 30 ++++++++++----------- meilidb-data/src/serde/mod.rs | 4 +++ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/meilidb-data/src/serde/convert_to_number.rs b/meilidb-data/src/serde/convert_to_number.rs index 57223a6c1..cf30e3b62 100644 --- a/meilidb-data/src/serde/convert_to_number.rs +++ b/meilidb-data/src/serde/convert_to_number.rs @@ -25,7 +25,7 @@ impl ser::Serializer for ConvertToNumber { } fn serialize_char(self, value: char) -> Result { - unimplemented!() + Err(SerializerError::UnrankableType { type_name: "char" }) } fn serialize_i8(self, value: i8) -> Result { @@ -73,25 +73,25 @@ impl ser::Serializer for ConvertToNumber { } fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { type_name: "&[u8]" }) + Err(SerializerError::UnrankableType { type_name: "&[u8]" }) } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnrankableType { type_name: "Option" }) } fn serialize_some(self, _value: &T) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { type_name: "Option" }) + Err(SerializerError::UnrankableType { type_name: "Option" }) } fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "()" }) + Err(SerializerError::UnrankableType { type_name: "()" }) } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { type_name: "unit struct" }) + Err(SerializerError::UnrankableType { type_name: "unit struct" }) } fn serialize_unit_variant( @@ -101,7 +101,7 @@ impl ser::Serializer for ConvertToNumber { _variant: &'static str ) -> Result { - Err(SerializerError::UnserializableType { type_name: "unit variant" }) + Err(SerializerError::UnrankableType { type_name: "unit variant" }) } fn serialize_newtype_struct( @@ -123,15 +123,15 @@ impl ser::Serializer for ConvertToNumber { ) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + Err(SerializerError::UnrankableType { type_name: "newtype variant" }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { type_name: "sequence" }) + Err(SerializerError::UnrankableType { type_name: "sequence" }) } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { type_name: "tuple" }) + Err(SerializerError::UnrankableType { type_name: "tuple" }) } fn serialize_tuple_struct( @@ -140,7 +140,7 @@ impl ser::Serializer for ConvertToNumber { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + Err(SerializerError::UnrankableType { type_name: "tuple struct" }) } fn serialize_tuple_variant( @@ -151,11 +151,11 @@ impl ser::Serializer for ConvertToNumber { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + Err(SerializerError::UnrankableType { type_name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { type_name: "map" }) + Err(SerializerError::UnrankableType { type_name: "map" }) } fn serialize_struct( @@ -164,7 +164,7 @@ impl ser::Serializer for ConvertToNumber { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { type_name: "struct" }) + Err(SerializerError::UnrankableType { type_name: "struct" }) } fn serialize_struct_variant( @@ -175,6 +175,6 @@ impl ser::Serializer for ConvertToNumber { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { type_name: "struct variant" }) + Err(SerializerError::UnrankableType { type_name: "struct variant" }) } } diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs index 75209c574..cf222c1bd 100644 --- a/meilidb-data/src/serde/mod.rs +++ b/meilidb-data/src/serde/mod.rs @@ -35,6 +35,7 @@ pub enum SerializerError { ParseNumberError(ParseNumberError), UnserializableType { type_name: &'static str }, UnindexableType { type_name: &'static str }, + UnrankableType { type_name: &'static str }, Custom(String), } @@ -61,6 +62,9 @@ impl fmt::Display for SerializerError { SerializerError::UnindexableType { type_name } => { write!(f, "{} are not an indexable type", type_name) }, + SerializerError::UnrankableType { type_name } => { + write!(f, "{} types can not be used for ranking", type_name) + }, SerializerError::Custom(s) => f.write_str(s), } }