chore: Move index related things to the meilidb-core workspace member

This commit is contained in:
Clément Renault 2019-02-24 19:44:24 +01:00
parent 3056b351fa
commit 14790eeae3
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
44 changed files with 1343 additions and 252 deletions

View File

@ -1,55 +1,5 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.2"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
arc-swap = "0.3.7"
bincode = "1.1.2"
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = { version = "0.1.8", features = ["serde"] }
lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
lockfree = "0.5.1"
log = "0.4.6"
rayon = "1.0.3"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
serde_json = { version = "1.0.38", features = ["preserve_order"] }
size_format = "1.0.2"
slice-group-by = "0.2.4"
unidecode = "0.3.0"
[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "306e201"
[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
portable = ["rocksdb/portable"]
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
[dev-dependencies]
csv = "1.0.5"
env_logger = "0.6.0"
jemallocator = "0.1.9"
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
structopt = "0.2.14"
tempfile = "3.0.7"
termcolor = "1.0.4"
[profile.release]
debug = true
[workspace]
members = [
"meilidb",
"meilidb-core",
]

21
meilidb-core/Cargo.toml Normal file
View File

@ -0,0 +1,21 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = "0.1.8"
lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.6"
rayon = "1.0.3"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
slice-group-by = "0.2.4"
[features]
i128 = ["byteorder/i128"]

View File

@ -1,7 +1,6 @@
use std::cmp::Ordering;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {

View File

@ -4,11 +4,11 @@ mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod sort_by_attr;
// mod sort_by_attr;
mod document_id;
use std::cmp::Ordering;
use crate::rank::RawDocument;
use crate::RawDocument;
pub use self::{
sum_of_typos::SumOfTypos,
@ -17,7 +17,7 @@ pub use self::{
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
sort_by_attr::SortByAttr,
// sort_by_attr::SortByAttr,
document_id::DocumentId,
};

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {

View File

@ -3,9 +3,9 @@ use std::error::Error;
use std::fmt;
use crate::database::schema::{Schema, SchemaAttr};
use crate::rank::criterion::Criterion;
use crate::criterion::Criterion;
use crate::database::RankedMap;
use crate::rank::RawDocument;
use crate::RawDocument;
/// A helper struct that permits sorting documents by
/// some of their stored attributes.

View File

@ -2,8 +2,8 @@ use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {

View File

@ -1,9 +1,7 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
const MAX_DISTANCE: u16 = 8;

View File

@ -1,16 +1,118 @@
pub mod criterion;
pub mod data;
mod index;
mod automaton;
mod query_builder;
mod distinct_map;
pub mod shared_data_cursor;
pub mod write_to_bytes;
use std::sync::Arc;
use serde_derive::{Serialize, Deserialize};
use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;
use crate::{Match, DocumentId};
pub use self::index::{Index, IndexBuilder};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(pub u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
///
/// The struct is `#[repr(C)]`, so its fields are laid out
/// in declaration order.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found.
pub attribute: u16,
/// The index of the word inside that attribute.
pub word_index: u16,
/// The position in bytes where the word was found.
///
/// Together with `char_length`, it describes the original word area
/// in the indexed text without needing to run the tokenizer again.
pub char_index: u16,
/// The length of the word found at `char_index`.
pub char_length: u16,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves
/// (the comparison traits are derived).
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that matches this word.
pub query_index: u32,
/// The distance the word has with the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in the document where the word was found.
pub attribute: u16,
/// The index of the word inside that attribute.
pub word_index: u16,
/// Whether the word that matches is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found.
///
/// Together with `char_length`, it describes the original word area
/// in the indexed text without needing to run the tokenizer again.
pub char_index: u16,
/// The length of the word found at `char_index`.
pub char_length: u16,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: 0,
word_index: 0,
is_exact: false,
char_index: 0,
char_length: 0,
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u16::max_value(),
word_index: u16::max_value(),
is_exact: true,
char_index: u16::max_value(),
char_length: u16::max_value(),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
@ -181,3 +283,15 @@ impl Matches {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// Guards the in-memory size of `DocIndex`.
    ///
    /// `DocIndex` is `#[repr(C)]` and holds a `DocumentId` (a `u64` newtype,
    /// 8 bytes) followed by four `u16` fields (4 * 2 bytes), so it must be
    /// exactly 16 bytes with no padding.
    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }
}

View File

@ -11,11 +11,23 @@ use fst::Streamer;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria;
use crate::database::Index;
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
use crate::{is_cjk, Match, DocumentId};
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::criterion::Criteria;
use crate::{raw_documents_from_matches, RawDocument, Document};
use crate::{Index, Match, DocumentId};
// query splitting must move out of this crate
/// Returns `true` when `c` falls inside one of the CJK-related
/// Unicode ranges listed below (all bounds inclusive).
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}' => true, // CJK Radicals Supplement
        '\u{2f00}'..='\u{2fdf}' => true, // Kangxi Radicals
        '\u{3040}'..='\u{309f}' => true, // Hiragana
        '\u{30a0}'..='\u{30ff}' => true, // Katakana
        '\u{3100}'..='\u{312f}' => true, // Bopomofo
        '\u{3200}'..='\u{32ff}' => true, // Enclosed CJK Letters and Months
        '\u{3400}'..='\u{4dbf}' => true, // CJK Unified Ideographs Extension A
        '\u{4e00}'..='\u{9fff}' => true, // CJK Unified Ideographs
        '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
        _ => false,
    }
}
#[derive(Debug, PartialEq, Eq)]
enum CharCategory {

1072
meilidb/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

50
meilidb/Cargo.toml Normal file
View File

@ -0,0 +1,50 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
arc-swap = "0.3.7"
bincode = "1.1.2"
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = { version = "0.1.8", features = ["serde"] }
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
lockfree = "0.5.1"
log = "0.4.6"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
serde_json = { version = "1.0.38", features = ["preserve_order"] }
size_format = "1.0.2"
slice-group-by = "0.2.4"
unidecode = "0.3.0"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "306e201"
[features]
default = ["simd"]
i128 = ["bincode/i128"]
portable = ["rocksdb/portable"]
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
[dev-dependencies]
csv = "1.0.5"
env_logger = "0.6.0"
jemallocator = "0.1.9"
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
structopt = "0.2.14"
tempfile = "3.0.7"
termcolor = "1.0.4"

View File

@ -5,7 +5,7 @@ use std::fmt;
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
use meilidb_core::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();

View File

@ -17,9 +17,9 @@ use hashbrown::HashMap;
use log::{info, error, warn};
use crate::database::schema::SchemaAttr;
use crate::shared_data_cursor::FromSharedDataCursor;
use crate::write_to_bytes::WriteToBytes;
use crate::DocumentId;
use meilidb_core::shared_data_cursor::FromSharedDataCursor;
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::{Index, DocumentId};
use self::update::{ReadIndexEvent, ReadRankedMapEvent};
@ -29,7 +29,6 @@ pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::Update;
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
pub use self::number::{Number, ParseNumberError};
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
@ -41,7 +40,6 @@ const CONFIG: &[u8] = b"config";
pub mod config;
pub mod schema;
pub(crate) mod index;
mod number;
mod document_key;
mod serde;

View File

@ -10,7 +10,7 @@ use linked_hash_map::LinkedHashMap;
use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
use meilidb_core::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };

View File

@ -10,7 +10,7 @@ use serde::de::{self, Visitor, IntoDeserializer};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::schema::Schema;
use crate::DocumentId;
use meilidb_core::DocumentId;
pub struct Deserializer<'a, D>
where D: Deref<Target=DB>

View File

@ -3,7 +3,7 @@ use serde::ser;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
use meilidb_core::DocumentId;
pub struct FindDocumentIdSerializer<'a> {
pub id_attribute_name: &'a str,

View File

@ -2,13 +2,14 @@ use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use meilidb_core::{DocumentId, DocIndex};
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{is_cjk, DocumentId, DocIndex};
use crate::is_cjk;
pub struct IndexerSerializer<'a, 'b, B> {
pub tokenizer_builder: &'a B,

View File

@ -10,7 +10,7 @@ use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
use meilidb_core::DocumentId;
pub struct Serializer<'a, 'b, B> {
pub schema: &'a Schema,

View File

@ -1,11 +1,11 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds;
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::Index;
use crate::data::DocIds;
pub enum WriteIndexEvent<'a> {
RemovedDocuments(&'a DocIds),

View File

@ -5,19 +5,18 @@ use rocksdb::rocksdb::{Writable, WriteBatch};
use hashbrown::hash_map::HashMap;
use sdset::{Set, SetBuf};
use serde::Serialize;
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds;
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::database::schema::Schema;
use crate::database::index::IndexBuilder;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number};
use crate::tokenizer::TokenizerBuilder;
use crate::write_to_bytes::WriteToBytes;
use crate::data::DocIds;
use crate::{DocumentId, DocIndex};
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};

View File

@ -1,11 +1,11 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds;
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::RankedMap;
use crate::data::DocIds;
pub enum WriteRankedMapEvent<'a> {
RemovedDocuments(&'a DocIds),

View File

@ -6,16 +6,15 @@ use std::{fmt, marker};
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;
use meilidb_core::{Index, QueryBuilder, FilterFunc};
use meilidb_core::DocumentId;
use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
use crate::database::serde::deserializer::Deserializer;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::database::RankedMap;
use crate::database::Config;
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>

22
meilidb/src/lib.rs Normal file
View File

@ -0,0 +1,22 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod database;
pub mod tokenizer;
mod common_words;
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
/// Returns `true` when `c` falls inside one of the CJK-related
/// Unicode ranges listed below (all bounds inclusive).
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}' => true, // CJK Radicals Supplement
        '\u{2f00}'..='\u{2fdf}' => true, // Kangxi Radicals
        '\u{3040}'..='\u{309f}' => true, // Hiragana
        '\u{30a0}'..='\u{30ff}' => true, // Katakana
        '\u{3100}'..='\u{312f}' => true, // Bopomofo
        '\u{3200}'..='\u{32ff}' => true, // Enclosed CJK Letters and Months
        '\u{3400}'..='\u{4dbf}' => true, // CJK Unified Ideographs Extension A
        '\u{4e00}'..='\u{9fff}' => true, // CJK Unified Ideographs
        '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
        _ => false,
    }
}

View File

@ -1,136 +0,0 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod common_words;
mod shared_data_cursor;
mod write_to_bytes;
use serde_derive::{Serialize, Deserialize};
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
/// Tells whether `c` belongs to one of the CJK-related Unicode ranges
/// enumerated in the table below (both ends of each range included).
pub fn is_cjk(c: char) -> bool {
    const CJK_RANGES: [(char, char); 9] = [
        ('\u{2e80}', '\u{2eff}'),
        ('\u{2f00}', '\u{2fdf}'),
        ('\u{3040}', '\u{309f}'),
        ('\u{30a0}', '\u{30ff}'),
        ('\u{3100}', '\u{312f}'),
        ('\u{3200}', '\u{32ff}'),
        ('\u{3400}', '\u{4dbf}'),
        ('\u{4e00}', '\u{9fff}'),
        ('\u{f900}', '\u{faff}'),
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| first <= c && c <= last)
}
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
///
/// The struct is `#[repr(C)]`, so its fields are laid out
/// in declaration order.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found.
pub attribute: u16,
/// The index of the word inside that attribute.
pub word_index: u16,
/// The position in bytes where the word was found.
///
/// Together with `char_length`, it describes the original word area
/// in the indexed text without needing to run the tokenizer again.
pub char_index: u16,
/// The length of the word found at `char_index`.
pub char_length: u16,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves
/// (the comparison traits are derived).
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that matches this word.
pub query_index: u32,
/// The distance the word has with the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in the document where the word was found.
pub attribute: u16,
/// The index of the word inside that attribute.
pub word_index: u16,
/// Whether the word that matches is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found.
///
/// Together with `char_length`, it describes the original word area
/// in the indexed text without needing to run the tokenizer again.
pub char_index: u16,
/// The length of the word found at `char_index`.
pub char_length: u16,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: 0,
word_index: 0,
is_exact: false,
char_index: 0,
char_length: 0,
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u16::max_value(),
word_index: u16::max_value(),
is_exact: true,
char_index: u16::max_value(),
char_length: u16::max_value(),
}
}
}
#[cfg(test)]
mod tests {
    use super::DocIndex;
    use std::mem::size_of;

    /// Guards the in-memory size of `DocIndex`, which is expected
    /// to stay at 16 bytes.
    #[test]
    fn docindex_mem_size() {
        let expected_bytes = 16;
        assert_eq!(size_of::<DocIndex>(), expected_bytes);
    }
}