mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
chore: Move index related things to the meilidb-core workspace member
This commit is contained in:
parent
3056b351fa
commit
14790eeae3
60
Cargo.toml
60
Cargo.toml
@ -1,55 +1,5 @@
|
|||||||
[package]
|
[workspace]
|
||||||
edition = "2018"
|
members = [
|
||||||
name = "meilidb"
|
"meilidb",
|
||||||
version = "0.3.2"
|
"meilidb-core",
|
||||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
]
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
arc-swap = "0.3.7"
|
|
||||||
bincode = "1.1.2"
|
|
||||||
byteorder = "1.3.1"
|
|
||||||
fst = "0.3.3"
|
|
||||||
hashbrown = { version = "0.1.8", features = ["serde"] }
|
|
||||||
lazy_static = "1.2.0"
|
|
||||||
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
|
||||||
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
|
|
||||||
lockfree = "0.5.1"
|
|
||||||
log = "0.4.6"
|
|
||||||
rayon = "1.0.3"
|
|
||||||
sdset = "0.3.1"
|
|
||||||
serde = "1.0.88"
|
|
||||||
serde_derive = "1.0.88"
|
|
||||||
serde_json = { version = "1.0.38", features = ["preserve_order"] }
|
|
||||||
size_format = "1.0.2"
|
|
||||||
slice-group-by = "0.2.4"
|
|
||||||
unidecode = "0.3.0"
|
|
||||||
|
|
||||||
[dependencies.toml]
|
|
||||||
git = "https://github.com/Kerollmops/toml-rs.git"
|
|
||||||
features = ["preserve_order"]
|
|
||||||
rev = "0372ba6"
|
|
||||||
|
|
||||||
[dependencies.rocksdb]
|
|
||||||
git = "https://github.com/pingcap/rust-rocksdb.git"
|
|
||||||
rev = "306e201"
|
|
||||||
|
|
||||||
[features]
|
|
||||||
default = ["simd"]
|
|
||||||
i128 = ["bincode/i128", "byteorder/i128"]
|
|
||||||
portable = ["rocksdb/portable"]
|
|
||||||
simd = ["rocksdb/sse"]
|
|
||||||
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
|
|
||||||
|
|
||||||
[dev-dependencies]
|
|
||||||
csv = "1.0.5"
|
|
||||||
env_logger = "0.6.0"
|
|
||||||
jemallocator = "0.1.9"
|
|
||||||
quickcheck = "0.8.2"
|
|
||||||
rand = "0.6.5"
|
|
||||||
rand_xorshift = "0.1.1"
|
|
||||||
structopt = "0.2.14"
|
|
||||||
tempfile = "3.0.7"
|
|
||||||
termcolor = "1.0.4"
|
|
||||||
|
|
||||||
[profile.release]
|
|
||||||
debug = true
|
|
||||||
|
21
meilidb-core/Cargo.toml
Normal file
21
meilidb-core/Cargo.toml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
[package]
|
||||||
|
name = "meilidb-core"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
byteorder = "1.3.1"
|
||||||
|
fst = "0.3.3"
|
||||||
|
hashbrown = "0.1.8"
|
||||||
|
lazy_static = "1.2.0"
|
||||||
|
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
||||||
|
log = "0.4.6"
|
||||||
|
rayon = "1.0.3"
|
||||||
|
sdset = "0.3.1"
|
||||||
|
serde = "1.0.88"
|
||||||
|
serde_derive = "1.0.88"
|
||||||
|
slice-group-by = "0.2.4"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
i128 = ["byteorder/i128"]
|
@ -1,7 +1,6 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::RawDocument;
|
||||||
use crate::rank::RawDocument;
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct DocumentId;
|
pub struct DocumentId;
|
@ -1,9 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::RawDocument;
|
||||||
use crate::rank::RawDocument;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
|
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
|
@ -4,11 +4,11 @@ mod words_proximity;
|
|||||||
mod sum_of_words_attribute;
|
mod sum_of_words_attribute;
|
||||||
mod sum_of_words_position;
|
mod sum_of_words_position;
|
||||||
mod exact;
|
mod exact;
|
||||||
mod sort_by_attr;
|
// mod sort_by_attr;
|
||||||
mod document_id;
|
mod document_id;
|
||||||
|
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use crate::rank::RawDocument;
|
use crate::RawDocument;
|
||||||
|
|
||||||
pub use self::{
|
pub use self::{
|
||||||
sum_of_typos::SumOfTypos,
|
sum_of_typos::SumOfTypos,
|
||||||
@ -17,7 +17,7 @@ pub use self::{
|
|||||||
sum_of_words_attribute::SumOfWordsAttribute,
|
sum_of_words_attribute::SumOfWordsAttribute,
|
||||||
sum_of_words_position::SumOfWordsPosition,
|
sum_of_words_position::SumOfWordsPosition,
|
||||||
exact::Exact,
|
exact::Exact,
|
||||||
sort_by_attr::SortByAttr,
|
// sort_by_attr::SortByAttr,
|
||||||
document_id::DocumentId,
|
document_id::DocumentId,
|
||||||
};
|
};
|
||||||
|
|
@ -1,9 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::RawDocument;
|
||||||
use crate::rank::RawDocument;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn number_of_query_words(query_index: &[u32]) -> usize {
|
fn number_of_query_words(query_index: &[u32]) -> usize {
|
@ -3,9 +3,9 @@ use std::error::Error;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
use crate::database::schema::{Schema, SchemaAttr};
|
use crate::database::schema::{Schema, SchemaAttr};
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::database::RankedMap;
|
use crate::database::RankedMap;
|
||||||
use crate::rank::RawDocument;
|
use crate::RawDocument;
|
||||||
|
|
||||||
/// An helper struct that permit to sort documents by
|
/// An helper struct that permit to sort documents by
|
||||||
/// some of their stored attributes.
|
/// some of their stored attributes.
|
@ -2,8 +2,8 @@ use std::cmp::Ordering;
|
|||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::RawDocument;
|
use crate::RawDocument;
|
||||||
|
|
||||||
// This function is a wrong logarithmic 10 function.
|
// This function is a wrong logarithmic 10 function.
|
||||||
// It is safe to panic on input number higher than 3,
|
// It is safe to panic on input number higher than 3,
|
@ -1,9 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::RawDocument;
|
||||||
use crate::rank::RawDocument;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
@ -1,9 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::RawDocument;
|
||||||
use crate::rank::RawDocument;
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
@ -1,9 +1,7 @@
|
|||||||
use std::cmp::{self, Ordering};
|
use std::cmp::{self, Ordering};
|
||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
use crate::criterion::Criterion;
|
||||||
use crate::rank::criterion::Criterion;
|
use crate::RawDocument;
|
||||||
use crate::rank::RawDocument;
|
|
||||||
|
|
||||||
const MAX_DISTANCE: u16 = 8;
|
const MAX_DISTANCE: u16 = 8;
|
||||||
|
|
@ -1,16 +1,118 @@
|
|||||||
pub mod criterion;
|
pub mod criterion;
|
||||||
|
pub mod data;
|
||||||
|
mod index;
|
||||||
|
mod automaton;
|
||||||
mod query_builder;
|
mod query_builder;
|
||||||
mod distinct_map;
|
mod distinct_map;
|
||||||
|
|
||||||
|
pub mod shared_data_cursor;
|
||||||
|
pub mod write_to_bytes;
|
||||||
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use serde_derive::{Serialize, Deserialize};
|
||||||
|
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
use rayon::slice::ParallelSliceMut;
|
use rayon::slice::ParallelSliceMut;
|
||||||
|
|
||||||
use crate::{Match, DocumentId};
|
pub use self::index::{Index, IndexBuilder};
|
||||||
|
|
||||||
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
|
||||||
|
|
||||||
|
/// Represents an internally generated, unique document identifier.
|
||||||
|
///
|
||||||
|
/// It is used to tell the database which document you want to deserialize.
|
||||||
|
/// Helpful for custom ranking.
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||||
|
pub struct DocumentId(pub u64);
|
||||||
|
|
||||||
|
/// This structure represents the position of a word
|
||||||
|
/// in a document and its attributes.
|
||||||
|
///
|
||||||
|
/// This is stored in the map, generated at index time,
|
||||||
|
/// extracted and interpreted at search time.
|
||||||
|
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
|
#[repr(C)]
|
||||||
|
pub struct DocIndex {
|
||||||
|
/// The identifier of the document where the word was found.
|
||||||
|
pub document_id: DocumentId,
|
||||||
|
|
||||||
|
/// The attribute in the document where the word was found
|
||||||
|
/// along with the index of the word in it.
|
||||||
|
pub attribute: u16,
|
||||||
|
pub word_index: u16,
|
||||||
|
|
||||||
|
/// The position in bytes where the word was found
|
||||||
|
/// along with its length.
|
||||||
|
///
|
||||||
|
/// It identifies the original word's span in the indexed text
|
||||||
|
/// without needing to run the tokenizer again.
// NOTE(review): the fields are named `char_*` but described as byte positions — confirm the unit.
|
||||||
|
pub char_index: u16,
|
||||||
|
pub char_length: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This structure represents a matching word with information
|
||||||
|
/// on the location of the word in the document.
|
||||||
|
///
|
||||||
|
/// The order of the fields is important because it defines
|
||||||
|
/// the way these structures are ordered relative to each other.
|
||||||
|
///
|
||||||
|
/// The word itself is not important.
|
||||||
|
// TODO do data oriented programming ? very arrays ?
|
||||||
|
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
|
pub struct Match {
|
||||||
|
/// The word index in the query sentence.
|
||||||
|
/// Same as the `attribute_index` but for the query words.
|
||||||
|
///
|
||||||
|
/// Used to retrieve the automaton that matches this word.
|
||||||
|
pub query_index: u32,
|
||||||
|
|
||||||
|
/// The distance between the word and the query word
|
||||||
|
/// (i.e. the Levenshtein distance).
|
||||||
|
pub distance: u8,
|
||||||
|
|
||||||
|
/// The attribute in the document where the word was found
|
||||||
|
/// along with the index of the word in it.
|
||||||
|
pub attribute: u16,
|
||||||
|
pub word_index: u16,
|
||||||
|
|
||||||
|
/// Whether the word that matches is an exact match or a prefix.
|
||||||
|
pub is_exact: bool,
|
||||||
|
|
||||||
|
/// The position in bytes where the word was found
|
||||||
|
/// along with its length.
|
||||||
|
///
|
||||||
|
/// It identifies the original word's span in the indexed text
|
||||||
|
/// without needing to run the tokenizer again.
// NOTE(review): the fields are named `char_*` but described as byte positions — confirm the unit.
|
||||||
|
pub char_index: u16,
|
||||||
|
pub char_length: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Match {
|
||||||
|
pub fn zero() -> Self {
|
||||||
|
Match {
|
||||||
|
query_index: 0,
|
||||||
|
distance: 0,
|
||||||
|
attribute: 0,
|
||||||
|
word_index: 0,
|
||||||
|
is_exact: false,
|
||||||
|
char_index: 0,
|
||||||
|
char_length: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn max() -> Self {
|
||||||
|
Match {
|
||||||
|
query_index: u32::max_value(),
|
||||||
|
distance: u8::max_value(),
|
||||||
|
attribute: u16::max_value(),
|
||||||
|
word_index: u16::max_value(),
|
||||||
|
is_exact: true,
|
||||||
|
char_index: u16::max_value(),
|
||||||
|
char_length: u16::max_value(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub struct Document {
|
pub struct Document {
|
||||||
pub id: DocumentId,
|
pub id: DocumentId,
|
||||||
@ -181,3 +283,15 @@ impl Matches {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// Guards the in-memory footprint of `DocIndex`, so that an accidental
    /// field change that grows the structure is caught.
    #[test]
    fn docindex_mem_size() {
        // `DocIndex` is `#[repr(C)]` and made of one `DocumentId` (a `u64`,
        // 8 bytes) followed by four `u16` fields (8 bytes): 16 bytes, no padding.
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }
}
|
@ -11,11 +11,23 @@ use fst::Streamer;
|
|||||||
use log::info;
|
use log::info;
|
||||||
|
|
||||||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||||
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
|
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||||
use crate::rank::criterion::Criteria;
|
use crate::criterion::Criteria;
|
||||||
use crate::database::Index;
|
use crate::{raw_documents_from_matches, RawDocument, Document};
|
||||||
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
|
use crate::{Index, Match, DocumentId};
|
||||||
use crate::{is_cjk, Match, DocumentId};
|
|
||||||
|
// query splitting must move out of this crate
|
||||||
|
/// Returns `true` when `c` belongs to one of the CJK-related Unicode ranges
/// handled by the query splitter, `false` otherwise.
///
/// Only the ranges listed in the body are recognized; any other character
/// (including other East Asian scripts outside these ranges) yields `false`.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
        | '\u{3040}'..='\u{309f}'   // Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
        _ => false,
    }
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
enum CharCategory {
|
enum CharCategory {
|
1072
meilidb/Cargo.lock
generated
Normal file
1072
meilidb/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
50
meilidb/Cargo.toml
Normal file
50
meilidb/Cargo.toml
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
[package]
|
||||||
|
edition = "2018"
|
||||||
|
name = "meilidb"
|
||||||
|
version = "0.3.1"
|
||||||
|
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
arc-swap = "0.3.7"
|
||||||
|
bincode = "1.1.2"
|
||||||
|
byteorder = "1.3.1"
|
||||||
|
fst = "0.3.3"
|
||||||
|
hashbrown = { version = "0.1.8", features = ["serde"] }
|
||||||
|
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
|
||||||
|
lockfree = "0.5.1"
|
||||||
|
log = "0.4.6"
|
||||||
|
sdset = "0.3.1"
|
||||||
|
serde = "1.0.88"
|
||||||
|
serde_derive = "1.0.88"
|
||||||
|
serde_json = { version = "1.0.38", features = ["preserve_order"] }
|
||||||
|
size_format = "1.0.2"
|
||||||
|
slice-group-by = "0.2.4"
|
||||||
|
unidecode = "0.3.0"
|
||||||
|
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
||||||
|
|
||||||
|
[dependencies.toml]
|
||||||
|
git = "https://github.com/Kerollmops/toml-rs.git"
|
||||||
|
features = ["preserve_order"]
|
||||||
|
rev = "0372ba6"
|
||||||
|
|
||||||
|
[dependencies.rocksdb]
|
||||||
|
git = "https://github.com/pingcap/rust-rocksdb.git"
|
||||||
|
rev = "306e201"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["simd"]
|
||||||
|
i128 = ["bincode/i128"]
|
||||||
|
portable = ["rocksdb/portable"]
|
||||||
|
simd = ["rocksdb/sse"]
|
||||||
|
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
csv = "1.0.5"
|
||||||
|
env_logger = "0.6.0"
|
||||||
|
jemallocator = "0.1.9"
|
||||||
|
quickcheck = "0.8.2"
|
||||||
|
rand = "0.6.5"
|
||||||
|
rand_xorshift = "0.1.1"
|
||||||
|
structopt = "0.2.14"
|
||||||
|
tempfile = "3.0.7"
|
||||||
|
termcolor = "1.0.4"
|
@ -5,7 +5,7 @@ use std::fmt;
|
|||||||
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
|
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
|
||||||
|
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
|
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
|
||||||
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
|
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
|
@ -17,9 +17,9 @@ use hashbrown::HashMap;
|
|||||||
use log::{info, error, warn};
|
use log::{info, error, warn};
|
||||||
|
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::shared_data_cursor::FromSharedDataCursor;
|
use meilidb_core::shared_data_cursor::FromSharedDataCursor;
|
||||||
use crate::write_to_bytes::WriteToBytes;
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
use crate::DocumentId;
|
use meilidb_core::{Index, DocumentId};
|
||||||
|
|
||||||
use self::update::{ReadIndexEvent, ReadRankedMapEvent};
|
use self::update::{ReadIndexEvent, ReadRankedMapEvent};
|
||||||
|
|
||||||
@ -29,7 +29,6 @@ pub use self::view::{DatabaseView, DocumentIter};
|
|||||||
pub use self::update::Update;
|
pub use self::update::Update;
|
||||||
pub use self::serde::SerializerError;
|
pub use self::serde::SerializerError;
|
||||||
pub use self::schema::Schema;
|
pub use self::schema::Schema;
|
||||||
pub use self::index::Index;
|
|
||||||
pub use self::number::{Number, ParseNumberError};
|
pub use self::number::{Number, ParseNumberError};
|
||||||
|
|
||||||
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
|
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
|
||||||
@ -41,7 +40,6 @@ const CONFIG: &[u8] = b"config";
|
|||||||
|
|
||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod schema;
|
pub mod schema;
|
||||||
pub(crate) mod index;
|
|
||||||
mod number;
|
mod number;
|
||||||
mod document_key;
|
mod document_key;
|
||||||
mod serde;
|
mod serde;
|
@ -10,7 +10,7 @@ use linked_hash_map::LinkedHashMap;
|
|||||||
|
|
||||||
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
|
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
|
||||||
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
|
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
|
@ -10,7 +10,7 @@ use serde::de::{self, Visitor, IntoDeserializer};
|
|||||||
|
|
||||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use crate::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
pub struct Deserializer<'a, D>
|
pub struct Deserializer<'a, D>
|
||||||
where D: Deref<Target=DB>
|
where D: Deref<Target=DB>
|
@ -3,7 +3,7 @@ use serde::ser;
|
|||||||
|
|
||||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||||
use crate::database::serde::{SerializerError, calculate_hash};
|
use crate::database::serde::{SerializerError, calculate_hash};
|
||||||
use crate::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
pub struct FindDocumentIdSerializer<'a> {
|
pub struct FindDocumentIdSerializer<'a> {
|
||||||
pub id_attribute_name: &'a str,
|
pub id_attribute_name: &'a str,
|
@ -2,13 +2,14 @@ use std::collections::HashSet;
|
|||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
use meilidb_core::{DocumentId, DocIndex};
|
||||||
|
|
||||||
use crate::database::update::DocumentUpdate;
|
use crate::database::update::DocumentUpdate;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
use crate::tokenizer::TokenizerBuilder;
|
||||||
use crate::tokenizer::Token;
|
use crate::tokenizer::Token;
|
||||||
use crate::{is_cjk, DocumentId, DocIndex};
|
use crate::is_cjk;
|
||||||
|
|
||||||
pub struct IndexerSerializer<'a, 'b, B> {
|
pub struct IndexerSerializer<'a, 'b, B> {
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
@ -10,7 +10,7 @@ use crate::database::update::DocumentUpdate;
|
|||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
use crate::tokenizer::TokenizerBuilder;
|
||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use crate::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
pub struct Serializer<'a, 'b, B> {
|
pub struct Serializer<'a, 'b, B> {
|
||||||
pub schema: &'a Schema,
|
pub schema: &'a Schema,
|
@ -1,11 +1,11 @@
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||||
|
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||||
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
|
use meilidb_core::data::DocIds;
|
||||||
|
|
||||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
|
||||||
use crate::write_to_bytes::WriteToBytes;
|
|
||||||
use crate::database::Index;
|
use crate::database::Index;
|
||||||
use crate::data::DocIds;
|
|
||||||
|
|
||||||
pub enum WriteIndexEvent<'a> {
|
pub enum WriteIndexEvent<'a> {
|
||||||
RemovedDocuments(&'a DocIds),
|
RemovedDocuments(&'a DocIds),
|
@ -5,19 +5,18 @@ use rocksdb::rocksdb::{Writable, WriteBatch};
|
|||||||
use hashbrown::hash_map::HashMap;
|
use hashbrown::hash_map::HashMap;
|
||||||
use sdset::{Set, SetBuf};
|
use sdset::{Set, SetBuf};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
|
use meilidb_core::data::DocIds;
|
||||||
|
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
|
||||||
|
|
||||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||||
use crate::database::serde::serializer::Serializer;
|
use crate::database::serde::serializer::Serializer;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use crate::database::index::IndexBuilder;
|
|
||||||
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
|
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
|
||||||
use crate::database::{RankedMap, Number};
|
use crate::database::{RankedMap, Number};
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
use crate::tokenizer::TokenizerBuilder;
|
||||||
use crate::write_to_bytes::WriteToBytes;
|
|
||||||
use crate::data::DocIds;
|
|
||||||
use crate::{DocumentId, DocIndex};
|
|
||||||
|
|
||||||
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
|
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
|
||||||
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
|
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
|
@ -1,11 +1,11 @@
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||||
|
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||||
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
|
use meilidb_core::data::DocIds;
|
||||||
|
|
||||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
|
||||||
use crate::write_to_bytes::WriteToBytes;
|
|
||||||
use crate::database::RankedMap;
|
use crate::database::RankedMap;
|
||||||
use crate::data::DocIds;
|
|
||||||
|
|
||||||
pub enum WriteRankedMapEvent<'a> {
|
pub enum WriteRankedMapEvent<'a> {
|
||||||
RemovedDocuments(&'a DocIds),
|
RemovedDocuments(&'a DocIds),
|
@ -6,16 +6,15 @@ use std::{fmt, marker};
|
|||||||
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
|
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
|
||||||
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
|
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
|
||||||
use serde::de::DeserializeOwned;
|
use serde::de::DeserializeOwned;
|
||||||
|
use meilidb_core::{Index, QueryBuilder, FilterFunc};
|
||||||
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
|
use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
|
||||||
use crate::database::serde::deserializer::Deserializer;
|
use crate::database::serde::deserializer::Deserializer;
|
||||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||||
use crate::rank::{QueryBuilder, FilterFunc};
|
|
||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use crate::database::index::Index;
|
|
||||||
use crate::database::RankedMap;
|
use crate::database::RankedMap;
|
||||||
use crate::database::Config;
|
use crate::database::Config;
|
||||||
use crate::DocumentId;
|
|
||||||
|
|
||||||
pub struct DatabaseView<D>
|
pub struct DatabaseView<D>
|
||||||
where D: Deref<Target=DB>
|
where D: Deref<Target=DB>
|
22
meilidb/src/lib.rs
Normal file
22
meilidb/src/lib.rs
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
#![cfg_attr(feature = "nightly", feature(test))]
|
||||||
|
|
||||||
|
pub mod database;
|
||||||
|
pub mod tokenizer;
|
||||||
|
mod common_words;
|
||||||
|
|
||||||
|
pub use rocksdb;
|
||||||
|
|
||||||
|
pub use self::tokenizer::Tokenizer;
|
||||||
|
pub use self::common_words::CommonWords;
|
||||||
|
|
||||||
|
/// Tells whether `c` falls inside one of the CJK-related Unicode ranges
/// recognized by this crate.
///
/// Characters outside the ranges listed in the body always yield `false`.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
        | '\u{3040}'..='\u{309f}'   // Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
        _ => false,
    }
}
|
136
src/lib.rs
136
src/lib.rs
@ -1,136 +0,0 @@
|
|||||||
#![cfg_attr(feature = "nightly", feature(test))]
|
|
||||||
|
|
||||||
pub mod automaton;
|
|
||||||
pub mod database;
|
|
||||||
pub mod data;
|
|
||||||
pub mod rank;
|
|
||||||
pub mod tokenizer;
|
|
||||||
mod common_words;
|
|
||||||
mod shared_data_cursor;
|
|
||||||
mod write_to_bytes;
|
|
||||||
|
|
||||||
use serde_derive::{Serialize, Deserialize};
|
|
||||||
|
|
||||||
pub use rocksdb;
|
|
||||||
|
|
||||||
pub use self::tokenizer::Tokenizer;
|
|
||||||
pub use self::common_words::CommonWords;
|
|
||||||
|
|
||||||
pub fn is_cjk(c: char) -> bool {
|
|
||||||
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
|
|
||||||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
|
|
||||||
(c >= '\u{3040}' && c <= '\u{309f}') ||
|
|
||||||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
|
|
||||||
(c >= '\u{3100}' && c <= '\u{312f}') ||
|
|
||||||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
|
|
||||||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
|
|
||||||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
|
|
||||||
(c >= '\u{f900}' && c <= '\u{faff}')
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Represent an internally generated document unique identifier.
|
|
||||||
///
|
|
||||||
/// It is used to inform the database the document you want to deserialize.
|
|
||||||
/// Helpful for custom ranking.
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
|
||||||
pub struct DocumentId(u64);
|
|
||||||
|
|
||||||
/// This structure represent the position of a word
|
|
||||||
/// in a document and its attributes.
|
|
||||||
///
|
|
||||||
/// This is stored in the map, generated at index time,
|
|
||||||
/// extracted and interpreted at search time.
|
|
||||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
||||||
#[repr(C)]
|
|
||||||
pub struct DocIndex {
|
|
||||||
/// The document identifier where the word was found.
|
|
||||||
pub document_id: DocumentId,
|
|
||||||
|
|
||||||
/// The attribute in the document where the word was found
|
|
||||||
/// along with the index in it.
|
|
||||||
pub attribute: u16,
|
|
||||||
pub word_index: u16,
|
|
||||||
|
|
||||||
/// The position in bytes where the word was found
|
|
||||||
/// along with the length of it.
|
|
||||||
///
|
|
||||||
/// It informs on the original word area in the text indexed
|
|
||||||
/// without needing to run the tokenizer again.
|
|
||||||
pub char_index: u16,
|
|
||||||
pub char_length: u16,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This structure represent a matching word with informations
|
|
||||||
/// on the location of the word in the document.
|
|
||||||
///
|
|
||||||
/// The order of the field is important because it defines
|
|
||||||
/// the way these structures are ordered between themselves.
|
|
||||||
///
|
|
||||||
/// The word in itself is not important.
|
|
||||||
// TODO do data oriented programming ? very arrays ?
|
|
||||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
||||||
pub struct Match {
|
|
||||||
/// The word index in the query sentence.
|
|
||||||
/// Same as the `attribute_index` but for the query words.
|
|
||||||
///
|
|
||||||
/// Used to retrieve the automaton that match this word.
|
|
||||||
pub query_index: u32,
|
|
||||||
|
|
||||||
/// The distance the word has with the query word
|
|
||||||
/// (i.e. the Levenshtein distance).
|
|
||||||
pub distance: u8,
|
|
||||||
|
|
||||||
/// The attribute in the document where the word was found
|
|
||||||
/// along with the index in it.
|
|
||||||
pub attribute: u16,
|
|
||||||
pub word_index: u16,
|
|
||||||
|
|
||||||
/// Whether the word that match is an exact match or a prefix.
|
|
||||||
pub is_exact: bool,
|
|
||||||
|
|
||||||
/// The position in bytes where the word was found
|
|
||||||
/// along with the length of it.
|
|
||||||
///
|
|
||||||
/// It informs on the original word area in the text indexed
|
|
||||||
/// without needing to run the tokenizer again.
|
|
||||||
pub char_index: u16,
|
|
||||||
pub char_length: u16,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Match {
|
|
||||||
pub fn zero() -> Self {
|
|
||||||
Match {
|
|
||||||
query_index: 0,
|
|
||||||
distance: 0,
|
|
||||||
attribute: 0,
|
|
||||||
word_index: 0,
|
|
||||||
is_exact: false,
|
|
||||||
char_index: 0,
|
|
||||||
char_length: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn max() -> Self {
|
|
||||||
Match {
|
|
||||||
query_index: u32::max_value(),
|
|
||||||
distance: u8::max_value(),
|
|
||||||
attribute: u16::max_value(),
|
|
||||||
word_index: u16::max_value(),
|
|
||||||
is_exact: true,
|
|
||||||
char_index: u16::max_value(),
|
|
||||||
char_length: u16::max_value(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
use std::mem;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn docindex_mem_size() {
|
|
||||||
assert_eq!(mem::size_of::<DocIndex>(), 16);
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user