mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
feat: Introduce the Index struct
This commit is contained in:
parent
ee2bad20c7
commit
4b40d5b0d4
@ -6,7 +6,7 @@ edition = "2018"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
byteorder = "1.3.1"
|
byteorder = "1.3.1"
|
||||||
hashbrown = "0.1.8"
|
hashbrown = "0.2.2"
|
||||||
lazy_static = "1.2.0"
|
lazy_static = "1.2.0"
|
||||||
log = "0.4.6"
|
log = "0.4.6"
|
||||||
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||||
|
@ -8,13 +8,17 @@ edition = "2018"
|
|||||||
arc-swap = "0.3.11"
|
arc-swap = "0.3.11"
|
||||||
bincode = "1.1.2"
|
bincode = "1.1.2"
|
||||||
byteorder = "1.3.1"
|
byteorder = "1.3.1"
|
||||||
hashbrown = { version = "0.1.8", features = ["serde"] }
|
hashbrown = { version = "0.2.2", features = ["serde"] }
|
||||||
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
||||||
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
||||||
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||||
ordered-float = { version = "1.0.2", features = ["serde"] }
|
ordered-float = { version = "1.0.2", features = ["serde"] }
|
||||||
sdset = "0.3.1"
|
sdset = "0.3.1"
|
||||||
serde = { version = "1.0.88", features = ["derive"] }
|
serde = { version = "1.0.90", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.39", features = ["preserve_order"] }
|
serde_json = { version = "1.0.39", features = ["preserve_order"] }
|
||||||
sled = "0.22.1"
|
sled = "0.22.1"
|
||||||
toml = { version = "0.5.0", features = ["preserve_order"] }
|
toml = { version = "0.5.0", features = ["preserve_order"] }
|
||||||
|
|
||||||
|
[dependencies.rmp-serde]
|
||||||
|
git = "https://github.com/3Hren/msgpack-rust.git"
|
||||||
|
rev = "40b3d48"
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
use std::io::{self, Cursor, BufRead};
|
||||||
|
use std::iter::FromIterator;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
@ -6,7 +9,11 @@ use hashbrown::HashMap;
|
|||||||
use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
|
use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
|
||||||
use meilidb_core::write_to_bytes::WriteToBytes;
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
use meilidb_core::{DocumentId, Index as WordIndex};
|
use meilidb_core::{DocumentId, Index as WordIndex};
|
||||||
|
use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
|
||||||
|
use rmp_serde::decode::{Error as RmpError};
|
||||||
|
use serde::{de, forward_to_deserialize_any};
|
||||||
use sled::IVec;
|
use sled::IVec;
|
||||||
|
use byteorder::{ReadBytesExt, BigEndian};
|
||||||
|
|
||||||
use crate::{Schema, SchemaAttr, RankedMap};
|
use crate::{Schema, SchemaAttr, RankedMap};
|
||||||
|
|
||||||
@ -46,6 +53,37 @@ fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec<u8> {
|
|||||||
bytes
|
bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extension methods for byte cursors, used while decoding raw keys.
trait CursorExt {
    /// Advances the cursor past `needle` when the unread bytes start with it.
    ///
    /// Returns `true` and moves the position forward by `needle.len()` on a
    /// match. Otherwise returns `false` and leaves the position untouched.
    /// A cursor positioned past the end of its buffer never matches.
    fn consume_if_eq(&mut self, needle: &[u8]) -> bool;
}

impl<T: AsRef<[u8]>> CursorExt for Cursor<T> {
    fn consume_if_eq(&mut self, needle: &[u8]) -> bool {
        let position = self.position() as usize;
        let bytes: &[u8] = self.get_ref().as_ref();

        // The position of a `Cursor` can be set beyond the end of the
        // underlying buffer; a checked `get` turns that case into "no match"
        // instead of the out-of-bounds panic that direct slicing would cause.
        let is_prefix = bytes
            .get(position..)
            .map_or(false, |unread| unread.starts_with(needle));

        if is_prefix {
            self.consume(needle.len());
        }

        is_prefix
    }
}
|
||||||
|
|
||||||
|
fn extract_document_key(key: Vec<u8>) -> io::Result<(DocumentId, SchemaAttr)> {
|
||||||
|
let mut key = Cursor::new(key);
|
||||||
|
|
||||||
|
if !key.consume_if_eq(b"document-") {
|
||||||
|
return Err(io::Error::from(io::ErrorKind::InvalidData))
|
||||||
|
}
|
||||||
|
|
||||||
|
let document_id = key.read_u64::<BigEndian>().map(DocumentId)?;
|
||||||
|
let schema_attr = key.read_u16::<BigEndian>().map(SchemaAttr)?;
|
||||||
|
|
||||||
|
Ok((document_id, schema_attr))
|
||||||
|
}
|
||||||
|
|
||||||
fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> {
|
fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> {
|
||||||
match ivec {
|
match ivec {
|
||||||
IVec::Inline(len, bytes) => Arc::from(&bytes[..len as usize]),
|
IVec::Inline(len, bytes) => Arc::from(&bytes[..len as usize]),
|
||||||
@ -55,7 +93,7 @@ fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> {
|
|||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Database {
|
pub struct Database {
|
||||||
opened: Arc<ArcSwap<HashMap<String, Index>>>,
|
opened: Arc<ArcSwap<HashMap<String, RawIndex>>>,
|
||||||
inner: sled::Db,
|
inner: sled::Db,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,22 +106,22 @@ impl Database {
|
|||||||
|
|
||||||
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
|
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
|
||||||
// check if the index was already opened
|
// check if the index was already opened
|
||||||
if let Some(index) = self.opened.lease().get(name) {
|
if let Some(raw_index) = self.opened.lease().get(name) {
|
||||||
return Ok(Some(index.clone()))
|
return Ok(Some(Index(raw_index.clone())))
|
||||||
}
|
}
|
||||||
|
|
||||||
let raw_name = index_name(name);
|
let raw_name = index_name(name);
|
||||||
if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) {
|
if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) {
|
||||||
let tree = self.inner.open_tree(raw_name)?;
|
let tree = self.inner.open_tree(raw_name)?;
|
||||||
let index = Index::from_raw(tree)?;
|
let raw_index = RawIndex::from_raw(tree)?;
|
||||||
|
|
||||||
self.opened.rcu(|opened| {
|
self.opened.rcu(|opened| {
|
||||||
let mut opened = HashMap::clone(opened);
|
let mut opened = HashMap::clone(opened);
|
||||||
opened.insert(name.to_string(), index.clone());
|
opened.insert(name.to_string(), raw_index.clone());
|
||||||
opened
|
opened
|
||||||
});
|
});
|
||||||
|
|
||||||
return Ok(Some(index))
|
return Ok(Some(Index(raw_index)))
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(None)
|
Ok(None)
|
||||||
@ -92,7 +130,7 @@ impl Database {
|
|||||||
pub fn create_index(&self, name: String, schema: Schema) -> Result<Index, Error> {
|
pub fn create_index(&self, name: String, schema: Schema) -> Result<Index, Error> {
|
||||||
match self.open_index(&name)? {
|
match self.open_index(&name)? {
|
||||||
Some(index) => {
|
Some(index) => {
|
||||||
if index.schema != schema {
|
if index.schema() != &schema {
|
||||||
return Err(Error::SchemaDiffer);
|
return Err(Error::SchemaDiffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -101,30 +139,30 @@ impl Database {
|
|||||||
None => {
|
None => {
|
||||||
let raw_name = index_name(&name);
|
let raw_name = index_name(&name);
|
||||||
let tree = self.inner.open_tree(raw_name)?;
|
let tree = self.inner.open_tree(raw_name)?;
|
||||||
let index = Index::new_from_raw(tree, schema)?;
|
let raw_index = RawIndex::new_from_raw(tree, schema)?;
|
||||||
|
|
||||||
self.opened.rcu(|opened| {
|
self.opened.rcu(|opened| {
|
||||||
let mut opened = HashMap::clone(opened);
|
let mut opened = HashMap::clone(opened);
|
||||||
opened.insert(name.clone(), index.clone());
|
opened.insert(name.clone(), raw_index.clone());
|
||||||
opened
|
opened
|
||||||
});
|
});
|
||||||
|
|
||||||
Ok(index)
|
Ok(Index(raw_index))
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Index {
|
pub struct RawIndex {
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
word_index: Arc<ArcSwap<WordIndex>>,
|
word_index: Arc<ArcSwap<WordIndex>>,
|
||||||
ranked_map: Arc<ArcSwap<RankedMap>>,
|
ranked_map: Arc<ArcSwap<RankedMap>>,
|
||||||
inner: Arc<sled::Tree>,
|
inner: Arc<sled::Tree>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Index {
|
impl RawIndex {
|
||||||
fn from_raw(inner: Arc<sled::Tree>) -> Result<Index, Error> {
|
fn from_raw(inner: Arc<sled::Tree>) -> Result<RawIndex, Error> {
|
||||||
let schema = {
|
let schema = {
|
||||||
let bytes = inner.get("schema")?;
|
let bytes = inner.get("schema")?;
|
||||||
let bytes = bytes.ok_or(Error::SchemaMissing)?;
|
let bytes = bytes.ok_or(Error::SchemaMissing)?;
|
||||||
@ -153,10 +191,10 @@ impl Index {
|
|||||||
Arc::new(ArcSwap::new(Arc::new(map)))
|
Arc::new(ArcSwap::new(Arc::new(map)))
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Index { schema, word_index, ranked_map, inner })
|
Ok(RawIndex { schema, word_index, ranked_map, inner })
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new_from_raw(inner: Arc<sled::Tree>, schema: Schema) -> Result<Index, Error> {
|
fn new_from_raw(inner: Arc<sled::Tree>, schema: Schema) -> Result<RawIndex, Error> {
|
||||||
let mut schema_bytes = Vec::new();
|
let mut schema_bytes = Vec::new();
|
||||||
schema.write_to_bin(&mut schema_bytes)?;
|
schema.write_to_bin(&mut schema_bytes)?;
|
||||||
inner.set("schema", schema_bytes)?;
|
inner.set("schema", schema_bytes)?;
|
||||||
@ -167,7 +205,7 @@ impl Index {
|
|||||||
|
|
||||||
let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
|
let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
|
||||||
|
|
||||||
Ok(Index { schema, word_index, ranked_map, inner })
|
Ok(RawIndex { schema, word_index, ranked_map, inner })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn schema(&self) -> &Schema {
|
pub fn schema(&self) -> &Schema {
|
||||||
@ -182,11 +220,11 @@ impl Index {
|
|||||||
self.ranked_map.lease()
|
self.ranked_map.lease()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_word_index(&self, word_index: Arc<WordIndex>) {
|
pub fn update_word_index(&self, word_index: Arc<WordIndex>) {
|
||||||
self.word_index.store(word_index)
|
self.word_index.store(word_index)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) {
|
pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) {
|
||||||
self.ranked_map.store(ranked_map)
|
self.ranked_map.store(ranked_map)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -212,6 +250,12 @@ impl Index {
|
|||||||
Ok(self.inner.get(key)?)
|
Ok(self.inner.get(key)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
|
||||||
|
let start = document_key(id, SchemaAttr::min());
|
||||||
|
let end = document_key(id, SchemaAttr::max());
|
||||||
|
DocumentFieldsIter(self.inner.range(start..=end))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn del_document_attribute(
|
pub fn del_document_attribute(
|
||||||
&self,
|
&self,
|
||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
@ -222,3 +266,150 @@ impl Index {
|
|||||||
Ok(self.inner.del(key)?)
|
Ok(self.inner.del(key)?)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
|
||||||
|
|
||||||
|
impl<'a> Iterator for DocumentFieldsIter<'a> {
|
||||||
|
type Item = Result<(DocumentId, SchemaAttr, IVec), Error>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
match self.0.next() {
|
||||||
|
Some(Ok((key, value))) => {
|
||||||
|
let (id, attr) = extract_document_key(key).unwrap();
|
||||||
|
Some(Ok((id, attr, value)))
|
||||||
|
},
|
||||||
|
Some(Err(e)) => Some(Err(Error::SledError(e))),
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct Index(RawIndex);
|
||||||
|
|
||||||
|
impl Index {
|
||||||
|
pub fn schema(&self) -> &Schema {
|
||||||
|
self.0.schema()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
|
||||||
|
self.0.word_index()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
|
||||||
|
self.0.ranked_map()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn document<T>(
|
||||||
|
&self,
|
||||||
|
fields: Option<&HashSet<&str>>,
|
||||||
|
id: DocumentId,
|
||||||
|
) -> Result<Option<T>, RmpError>
|
||||||
|
where T: de::DeserializeOwned,
|
||||||
|
{
|
||||||
|
let fields = match fields {
|
||||||
|
Some(fields) => {
|
||||||
|
let iter = fields.iter().filter_map(|n| self.0.schema().attribute(n));
|
||||||
|
Some(HashSet::from_iter(iter))
|
||||||
|
},
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut deserializer = Deserializer {
|
||||||
|
document_id: id,
|
||||||
|
raw_index: &self.0,
|
||||||
|
fields: fields.as_ref(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: currently we return an error if all document fields are missing,
|
||||||
|
// returning None would have been better
|
||||||
|
T::deserialize(&mut deserializer).map(Some)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Deserializer<'a> {
|
||||||
|
document_id: DocumentId,
|
||||||
|
raw_index: &'a RawIndex,
|
||||||
|
fields: Option<&'a HashSet<SchemaAttr>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
|
||||||
|
{
|
||||||
|
type Error = RmpError;
|
||||||
|
|
||||||
|
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||||
|
where V: de::Visitor<'de>
|
||||||
|
{
|
||||||
|
self.deserialize_map(visitor)
|
||||||
|
}
|
||||||
|
|
||||||
|
forward_to_deserialize_any! {
|
||||||
|
bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
|
||||||
|
bytes byte_buf unit_struct tuple_struct
|
||||||
|
identifier tuple ignored_any option newtype_struct enum struct
|
||||||
|
}
|
||||||
|
|
||||||
|
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
|
||||||
|
where V: de::Visitor<'de>
|
||||||
|
{
|
||||||
|
let document_attributes = self.raw_index.get_document_fields(self.document_id);
|
||||||
|
let document_attributes = document_attributes.filter_map(|result| {
|
||||||
|
match result {
|
||||||
|
Ok(value) => Some(value),
|
||||||
|
Err(e) => {
|
||||||
|
// TODO: must log the error
|
||||||
|
// error!("sled iter error; {}", e);
|
||||||
|
None
|
||||||
|
},
|
||||||
|
}
|
||||||
|
});
|
||||||
|
let iter = document_attributes.filter_map(|(_, attr, value)| {
|
||||||
|
if self.fields.map_or(true, |f| f.contains(&attr)) {
|
||||||
|
let attribute_name = self.raw_index.schema.attribute_name(attr);
|
||||||
|
Some((attribute_name, Value::new(value)))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let map_deserializer = de::value::MapDeserializer::new(iter);
|
||||||
|
visitor.visit_map(map_deserializer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
|
||||||
|
|
||||||
|
impl<A> Value<A> where A: AsRef<[u8]>
|
||||||
|
{
|
||||||
|
fn new(value: A) -> Value<A> {
|
||||||
|
Value(RmpDeserializer::new(Cursor::new(value)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
|
||||||
|
where A: AsRef<[u8]>,
|
||||||
|
{
|
||||||
|
type Deserializer = Self;
|
||||||
|
|
||||||
|
fn into_deserializer(self) -> Self::Deserializer {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
|
||||||
|
where A: AsRef<[u8]>,
|
||||||
|
{
|
||||||
|
type Error = RmpError;
|
||||||
|
|
||||||
|
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
|
||||||
|
where V: de::Visitor<'de>
|
||||||
|
{
|
||||||
|
self.0.deserialize_any(visitor)
|
||||||
|
}
|
||||||
|
|
||||||
|
forward_to_deserialize_any! {
|
||||||
|
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
|
||||||
|
bytes byte_buf option unit unit_struct newtype_struct seq tuple
|
||||||
|
tuple_struct map struct enum identifier ignored_any
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user