mirror of https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
chore: Move index related things to the meilidb-core workspace member
This commit is contained in:
parent 3056b351fa
commit 14790eeae3
60 Cargo.toml
@@ -1,55 +1,5 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.2"
authors = ["Kerollmops <renault.cle@gmail.com>"]

[dependencies]
arc-swap = "0.3.7"
bincode = "1.1.2"
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = { version = "0.1.8", features = ["serde"] }
lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
lockfree = "0.5.1"
log = "0.4.6"
rayon = "1.0.3"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
serde_json = { version = "1.0.38", features = ["preserve_order"] }
size_format = "1.0.2"
slice-group-by = "0.2.4"
unidecode = "0.3.0"

[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"

[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "306e201"

[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
portable = ["rocksdb/portable"]
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

[dev-dependencies]
csv = "1.0.5"
env_logger = "0.6.0"
jemallocator = "0.1.9"
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
structopt = "0.2.14"
tempfile = "3.0.7"
termcolor = "1.0.4"

[profile.release]
debug = true
[workspace]
members = [
    "meilidb",
    "meilidb-core",
]
21 meilidb-core/Cargo.toml Normal file
@@ -0,0 +1,21 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"

[dependencies]
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = "0.1.8"
lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.6"
rayon = "1.0.3"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
slice-group-by = "0.2.4"

[features]
i128 = ["byteorder/i128"]
@@ -1,7 +1,6 @@
use std::cmp::Ordering;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

#[derive(Debug, Clone, Copy)]
pub struct DocumentId;

@@ -1,9 +1,7 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

#[inline]
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {

@@ -4,11 +4,11 @@ mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod sort_by_attr;
// mod sort_by_attr;
mod document_id;

use std::cmp::Ordering;
use crate::rank::RawDocument;
use crate::RawDocument;

pub use self::{
    sum_of_typos::SumOfTypos,
@@ -17,7 +17,7 @@ pub use self::{
    sum_of_words_attribute::SumOfWordsAttribute,
    sum_of_words_position::SumOfWordsPosition,
    exact::Exact,
    sort_by_attr::SortByAttr,
    // sort_by_attr::SortByAttr,
    document_id::DocumentId,
};
@@ -1,9 +1,7 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {

@@ -3,9 +3,9 @@ use std::error::Error;
use std::fmt;

use crate::database::schema::{Schema, SchemaAttr};
use crate::rank::criterion::Criterion;
use crate::criterion::Criterion;
use crate::database::RankedMap;
use crate::rank::RawDocument;
use crate::RawDocument;

/// An helper struct that permit to sort documents by
/// some of their stored attributes.

@@ -2,8 +2,8 @@ use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,

@@ -1,9 +1,7 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {

@@ -1,9 +1,7 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {

@@ -1,9 +1,7 @@
use std::cmp::{self, Ordering};

use slice_group_by::GroupBy;

use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;

const MAX_DISTANCE: u16 = 8;
@@ -1,16 +1,118 @@
pub mod criterion;
pub mod data;
mod index;
mod automaton;
mod query_builder;
mod distinct_map;

pub mod shared_data_cursor;
pub mod write_to_bytes;

use std::sync::Arc;
use serde_derive::{Serialize, Deserialize};

use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;

use crate::{Match, DocumentId};

pub use self::index::{Index, IndexBuilder};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};

/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(pub u64);

/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
    /// The document identifier where the word was found.
    pub document_id: DocumentId,

    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
    pub word_index: u16,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u16,
    pub char_length: u16,
}

/// This structure represent a matching word with informations
/// on the location of the word in the document.
///
/// The order of the field is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// The word index in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that match this word.
    pub query_index: u32,

    /// The distance the word has with the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
    pub word_index: u16,

    /// Whether the word that match is an exact match or a prefix.
    pub is_exact: bool,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u16,
    pub char_length: u16,
}

impl Match {
    pub fn zero() -> Self {
        Match {
            query_index: 0,
            distance: 0,
            attribute: 0,
            word_index: 0,
            is_exact: false,
            char_index: 0,
            char_length: 0,
        }
    }

    pub fn max() -> Self {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u16::max_value(),
            word_index: u16::max_value(),
            is_exact: true,
            char_index: u16::max_value(),
            char_length: u16::max_value(),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
    pub id: DocumentId,

@@ -181,3 +283,15 @@ impl Matches {
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 24);
    }
}
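The `DocumentId`, `DocIndex` and `Match` definitions keep their shapes after the move, but `DocumentId`'s inner field becomes public (`pub u64`), so the meilidb crate can construct identifiers directly from the core crate. A minimal sketch of how these types fit together; the values are made up purely for illustration and are not part of the diff:

// Illustrative only: using the moved types from another workspace member.
use meilidb_core::{DocumentId, DocIndex, Match};

fn example() {
    // The inner u64 is now public, so an identifier can be built directly.
    let id = DocumentId(42);

    // One occurrence of a word: attribute 1, third word, starting at byte 7, 5 bytes long.
    let index = DocIndex {
        document_id: id,
        attribute: 1,
        word_index: 2,
        char_index: 7,
        char_length: 5,
    };

    // Match derives Ord, so zero() and max() bound every possible match.
    assert!(Match::zero() <= Match::max());
    let _ = index;
}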
@@ -11,11 +11,23 @@ use fst::Streamer;
use log::info;

use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria;
use crate::database::Index;
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
use crate::{is_cjk, Match, DocumentId};
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::criterion::Criteria;
use crate::{raw_documents_from_matches, RawDocument, Document};
use crate::{Index, Match, DocumentId};

// query splitting must move out of this crate
pub fn is_cjk(c: char) -> bool {
    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
    (c >= '\u{3040}' && c <= '\u{309f}') ||
    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
    (c >= '\u{3100}' && c <= '\u{312f}') ||
    (c >= '\u{3200}' && c <= '\u{32ff}') ||
    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
    (c >= '\u{f900}' && c <= '\u{faff}')
}

#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
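The `is_cjk` helper copied into this file simply checks a character against the fixed Unicode ranges listed above (the comment notes that query splitting should eventually leave this crate). A quick sanity check of those ranges; the sample characters are chosen for illustration and do not appear in the diff:

// Illustrative check of the ranges above.
#[test]
fn is_cjk_examples() {
    assert!(is_cjk('日'));   // U+65E5, inside U+4E00..=U+9FFF
    assert!(is_cjk('あ'));   // U+3042, inside the hiragana range U+3040..=U+309F
    assert!(!is_cjk('a'));   // ASCII, outside every listed range
    assert!(!is_cjk('é'));   // Latin-1 Supplement, also outside
}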
1072 meilidb/Cargo.lock generated Normal file
File diff suppressed because it is too large
50 meilidb/Cargo.toml Normal file
@@ -0,0 +1,50 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]

[dependencies]
arc-swap = "0.3.7"
bincode = "1.1.2"
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = { version = "0.1.8", features = ["serde"] }
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
lockfree = "0.5.1"
log = "0.4.6"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
serde_json = { version = "1.0.38", features = ["preserve_order"] }
size_format = "1.0.2"
slice-group-by = "0.2.4"
unidecode = "0.3.0"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }

[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"

[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "306e201"

[features]
default = ["simd"]
i128 = ["bincode/i128"]
portable = ["rocksdb/portable"]
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

[dev-dependencies]
csv = "1.0.5"
env_logger = "0.6.0"
jemallocator = "0.1.9"
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
structopt = "0.2.14"
tempfile = "3.0.7"
termcolor = "1.0.4"
@@ -5,7 +5,7 @@ use std::fmt;
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};

use crate::database::schema::SchemaAttr;
use crate::DocumentId;
use meilidb_core::DocumentId;

const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
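This hunk and the meilidb-side hunks that follow are the same mechanical substitution: identifiers that moved into the new crate are now imported from `meilidb_core` instead of from the meilidb crate root. A minimal sketch of the pattern; only the import line reflects the commit, the helper function is hypothetical:

// Before this commit the type lived at the meilidb crate root:
// use crate::DocumentId;
// After the split it comes from the meilidb-core workspace member:
use meilidb_core::DocumentId;

// Hypothetical helper, for illustration only: function bodies stay unchanged,
// since the type itself is identical and only its home crate moved.
fn format_key(id: DocumentId) -> String {
    format!("doc-{}", id.0)
}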
@@ -17,9 +17,9 @@ use hashbrown::HashMap;
use log::{info, error, warn};

use crate::database::schema::SchemaAttr;
use crate::shared_data_cursor::FromSharedDataCursor;
use crate::write_to_bytes::WriteToBytes;
use crate::DocumentId;
use meilidb_core::shared_data_cursor::FromSharedDataCursor;
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::{Index, DocumentId};

use self::update::{ReadIndexEvent, ReadRankedMapEvent};

@@ -29,7 +29,6 @@ pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::Update;
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
pub use self::number::{Number, ParseNumberError};

pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;

@@ -41,7 +40,6 @@ const CONFIG: &[u8] = b"config";

pub mod config;
pub mod schema;
pub(crate) mod index;
mod number;
mod document_key;
mod serde;
@@ -10,7 +10,7 @@ use linked_hash_map::LinkedHashMap;

use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
use meilidb_core::DocumentId;

pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };

@@ -10,7 +10,7 @@ use serde::de::{self, Visitor, IntoDeserializer};

use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::schema::Schema;
use crate::DocumentId;
use meilidb_core::DocumentId;

pub struct Deserializer<'a, D>
where D: Deref<Target=DB>

@@ -3,7 +3,7 @@ use serde::ser;

use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
use meilidb_core::DocumentId;

pub struct FindDocumentIdSerializer<'a> {
    pub id_attribute_name: &'a str,

@@ -2,13 +2,14 @@ use std::collections::HashSet;

use serde::Serialize;
use serde::ser;
use meilidb_core::{DocumentId, DocIndex};

use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{is_cjk, DocumentId, DocIndex};
use crate::is_cjk;

pub struct IndexerSerializer<'a, 'b, B> {
    pub tokenizer_builder: &'a B,

@@ -10,7 +10,7 @@ use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
use meilidb_core::DocumentId;

pub struct Serializer<'a, 'b, B> {
    pub schema: &'a Schema,
@@ -1,11 +1,11 @@
use std::error::Error;

use byteorder::{ReadBytesExt, WriteBytesExt};
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds;

use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::Index;
use crate::data::DocIds;

pub enum WriteIndexEvent<'a> {
    RemovedDocuments(&'a DocIds),

@@ -5,19 +5,18 @@ use rocksdb::rocksdb::{Writable, WriteBatch};
use hashbrown::hash_map::HashMap;
use sdset::{Set, SetBuf};
use serde::Serialize;
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds;
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};

use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::database::schema::Schema;
use crate::database::index::IndexBuilder;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number};
use crate::tokenizer::TokenizerBuilder;
use crate::write_to_bytes::WriteToBytes;
use crate::data::DocIds;
use crate::{DocumentId, DocIndex};

pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
@@ -1,11 +1,11 @@
use std::error::Error;

use byteorder::{ReadBytesExt, WriteBytesExt};
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds;

use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::RankedMap;
use crate::data::DocIds;

pub enum WriteRankedMapEvent<'a> {
    RemovedDocuments(&'a DocIds),

@@ -6,16 +6,15 @@ use std::{fmt, marker};
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;
use meilidb_core::{Index, QueryBuilder, FilterFunc};
use meilidb_core::DocumentId;

use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
use crate::database::serde::deserializer::Deserializer;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::database::RankedMap;
use crate::database::Config;
use crate::DocumentId;

pub struct DatabaseView<D>
where D: Deref<Target=DB>
22 meilidb/src/lib.rs Normal file
@@ -0,0 +1,22 @@
#![cfg_attr(feature = "nightly", feature(test))]

pub mod database;
pub mod tokenizer;
mod common_words;

pub use rocksdb;

pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

pub fn is_cjk(c: char) -> bool {
    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
    (c >= '\u{3040}' && c <= '\u{309f}') ||
    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
    (c >= '\u{3100}' && c <= '\u{312f}') ||
    (c >= '\u{3200}' && c <= '\u{32ff}') ||
    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
    (c >= '\u{f900}' && c <= '\u{faff}')
}
136 src/lib.rs
@@ -1,136 +0,0 @@
#![cfg_attr(feature = "nightly", feature(test))]

pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod common_words;
mod shared_data_cursor;
mod write_to_bytes;

use serde_derive::{Serialize, Deserialize};

pub use rocksdb;

pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

pub fn is_cjk(c: char) -> bool {
    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
    (c >= '\u{3040}' && c <= '\u{309f}') ||
    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
    (c >= '\u{3100}' && c <= '\u{312f}') ||
    (c >= '\u{3200}' && c <= '\u{32ff}') ||
    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
    (c >= '\u{f900}' && c <= '\u{faff}')
}

/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);

/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
    /// The document identifier where the word was found.
    pub document_id: DocumentId,

    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
    pub word_index: u16,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u16,
    pub char_length: u16,
}
/// This structure represent a matching word with informations
/// on the location of the word in the document.
///
/// The order of the field is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// The word index in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that match this word.
    pub query_index: u32,

    /// The distance the word has with the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
    pub word_index: u16,

    /// Whether the word that match is an exact match or a prefix.
    pub is_exact: bool,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
    pub char_index: u16,
    pub char_length: u16,
}

impl Match {
    pub fn zero() -> Self {
        Match {
            query_index: 0,
            distance: 0,
            attribute: 0,
            word_index: 0,
            is_exact: false,
            char_index: 0,
            char_length: 0,
        }
    }

    pub fn max() -> Self {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u16::max_value(),
            word_index: u16::max_value(),
            is_exact: true,
            char_index: u16::max_value(),
            char_length: u16::max_value(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }
}