MeiliSearch/meilidb-data/src/database.rs

445 lines
12 KiB
Rust
Raw Normal View History

2019-04-18 13:58:35 +02:00
use std::collections::HashSet;
use std::io::{self, Cursor, BufRead};
use std::iter::FromIterator;
2019-04-08 15:19:57 +02:00
use std::path::Path;
use std::sync::Arc;
2019-04-08 15:19:57 +02:00
use arc_swap::{ArcSwap, Lease};
use byteorder::{ReadBytesExt, BigEndian};
use hashbrown::HashMap;
use meilidb_core::criterion::Criteria;
use meilidb_core::QueryBuilder;
use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::{DocumentId, Index as WordIndex};
2019-04-18 13:58:35 +02:00
use rmp_serde::decode::{Error as RmpError};
use sdset::SetBuf;
use serde::de;
use sled::IVec;
use crate::{Schema, SchemaAttr, RankedMap};
use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
use crate::indexer::Indexer;
2019-04-08 15:19:57 +02:00
#[derive(Debug)]
pub enum Error {
SchemaDiffer,
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
2019-04-08 15:19:57 +02:00
SledError(sled::Error),
BincodeError(bincode::Error),
SerializerError(SerializerError),
2019-04-08 15:19:57 +02:00
}
impl From<sled::Error> for Error {
fn from(error: sled::Error) -> Error {
Error::SledError(error)
}
}
impl From<bincode::Error> for Error {
fn from(error: bincode::Error) -> Error {
Error::BincodeError(error)
}
}
impl From<SerializerError> for Error {
fn from(error: SerializerError) -> Error {
Error::SerializerError(error)
}
}
2019-04-08 15:19:57 +02:00
/// Builds the raw sled tree name for the index called `name`.
///
/// The result is the UTF-8 bytes of `name` prefixed with `index-`.
fn index_name(name: &str) -> Vec<u8> {
    [&b"index-"[..], name.as_bytes()].concat()
}
/// Builds the storage key of one attribute of one document.
///
/// The key layout is `document-` followed by the big-endian bytes of the
/// document id and then the big-endian bytes of the schema attribute.
fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec<u8> {
    let DocumentId(raw_id) = id;
    let SchemaAttr(raw_attr) = attr;

    let id_bytes = raw_id.to_be_bytes();
    let attr_bytes = raw_attr.to_be_bytes();

    [&b"document-"[..], &id_bytes[..], &attr_bytes[..]].concat()
}
2019-04-18 13:58:35 +02:00
/// Extension trait adding prefix matching to byte cursors.
trait CursorExt {
    /// Advances the cursor past `needle` if the unread bytes start with it.
    ///
    /// Returns `true` when the prefix matched and was consumed. Returns
    /// `false`, leaving the position untouched, when it did not match or
    /// when the cursor is already positioned past the end of its buffer.
    fn consume_if_eq(&mut self, needle: &[u8]) -> bool;
}

impl<T: AsRef<[u8]>> CursorExt for Cursor<T> {
    fn consume_if_eq(&mut self, needle: &[u8]) -> bool {
        let position = self.position();

        let matched = {
            let bytes = self.get_ref().as_ref();
            // A cursor may legally be positioned beyond the end of its buffer
            // (e.g. through `set_position`), so the bound is checked before
            // slicing to avoid an out-of-range panic. The comparison is done
            // in `u64` so that the position is never truncated first.
            position <= bytes.len() as u64 && bytes[position as usize..].starts_with(needle)
        };

        if matched {
            self.consume(needle.len());
        }

        matched
    }
}
fn extract_document_key(key: Vec<u8>) -> io::Result<(DocumentId, SchemaAttr)> {
let mut key = Cursor::new(key);
if !key.consume_if_eq(b"document-") {
return Err(io::Error::from(io::ErrorKind::InvalidData))
}
let document_id = key.read_u64::<BigEndian>().map(DocumentId)?;
let schema_attr = key.read_u16::<BigEndian>().map(SchemaAttr)?;
Ok((document_id, schema_attr))
}
/// Converts a sled `IVec` into a shared byte slice.
///
/// A remote buffer is returned as is; an inline value is copied into a new
/// `Arc`, keeping only its first `len` bytes.
fn ivec_into_arc(ivec: IVec) -> Arc<[u8]> {
    match ivec {
        IVec::Remote { buf } => buf,
        IVec::Inline(len, bytes) => {
            let used = len as usize;
            Arc::from(&bytes[..used])
        }
    }
}
/// Handle on a sled database together with the indexes already opened from it.
#[derive(Clone)]
pub struct Database {
2019-04-18 13:58:35 +02:00
/// Indexes already opened, keyed by index name; `open_index` looks here
/// before touching sled and both `open_index` and `create_index` add to it.
opened: Arc<ArcSwap<HashMap<String, RawIndex>>>,
/// The underlying sled database in which every index has its own tree.
inner: sled::Db,
}
impl Database {
2019-04-08 15:19:57 +02:00
pub fn start_default<P: AsRef<Path>>(path: P) -> Result<Database, Error> {
let inner = sled::Db::start_default(path)?;
let opened = Arc::new(ArcSwap::new(Arc::new(HashMap::new())));
Ok(Database { opened, inner })
}
2019-04-08 15:19:57 +02:00
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
// check if the index was already opened
2019-04-18 13:58:35 +02:00
if let Some(raw_index) = self.opened.lease().get(name) {
return Ok(Some(Index(raw_index.clone())))
}
let raw_name = index_name(name);
if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) {
let tree = self.inner.open_tree(raw_name)?;
2019-04-18 13:58:35 +02:00
let raw_index = RawIndex::from_raw(tree)?;
self.opened.rcu(|opened| {
let mut opened = HashMap::clone(opened);
2019-04-18 13:58:35 +02:00
opened.insert(name.to_string(), raw_index.clone());
opened
});
2019-04-18 13:58:35 +02:00
return Ok(Some(Index(raw_index)))
2019-04-08 15:19:57 +02:00
}
Ok(None)
}
pub fn create_index(&self, name: String, schema: Schema) -> Result<Index, Error> {
match self.open_index(&name)? {
2019-04-08 15:19:57 +02:00
Some(index) => {
2019-04-18 13:58:35 +02:00
if index.schema() != &schema {
return Err(Error::SchemaDiffer);
}
2019-04-08 15:19:57 +02:00
Ok(index)
},
None => {
let raw_name = index_name(&name);
let tree = self.inner.open_tree(raw_name)?;
2019-04-18 13:58:35 +02:00
let raw_index = RawIndex::new_from_raw(tree, schema)?;
self.opened.rcu(|opened| {
let mut opened = HashMap::clone(opened);
2019-04-18 13:58:35 +02:00
opened.insert(name.clone(), raw_index.clone());
opened
});
2019-04-18 13:58:35 +02:00
Ok(Index(raw_index))
2019-04-08 15:19:57 +02:00
},
}
}
}
/// Low-level view of one index: its schema, its in-memory word index and
/// ranked map, and the sled tree that stores its data.
#[derive(Clone)]
2019-04-18 13:58:35 +02:00
pub struct RawIndex {
2019-04-08 15:19:57 +02:00
/// Schema of the documents stored in this index.
schema: Schema,
/// Current word index; replaced as a whole by `update_word_index`.
word_index: Arc<ArcSwap<WordIndex>>,
/// Current ranked map; replaced as a whole by `update_ranked_map`.
ranked_map: Arc<ArcSwap<RankedMap>>,
2019-04-08 15:19:57 +02:00
/// The sled tree holding the schema, the word index and the document attributes.
inner: Arc<sled::Tree>,
}
2019-04-18 13:58:35 +02:00
impl RawIndex {
fn from_raw(inner: Arc<sled::Tree>) -> Result<RawIndex, Error> {
let schema = {
let bytes = inner.get("schema")?;
let bytes = bytes.ok_or(Error::SchemaMissing)?;
Schema::read_from_bin(bytes.as_ref())?
};
let bytes = inner.get("word-index")?;
let bytes = bytes.ok_or(Error::WordIndexMissing)?;
let word_index = {
let len = bytes.len();
let bytes = ivec_into_arc(bytes);
let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len);
// TODO must handle this error
let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap();
Arc::new(ArcSwap::new(Arc::new(word_index)))
};
let ranked_map = {
let map = match inner.get("ranked-map")? {
Some(bytes) => bincode::deserialize(bytes.as_ref())?,
None => RankedMap::default(),
};
Arc::new(ArcSwap::new(Arc::new(map)))
};
2019-04-18 13:58:35 +02:00
Ok(RawIndex { schema, word_index, ranked_map, inner })
2019-04-08 15:19:57 +02:00
}
2019-04-18 13:58:35 +02:00
fn new_from_raw(inner: Arc<sled::Tree>, schema: Schema) -> Result<RawIndex, Error> {
2019-04-08 15:19:57 +02:00
let mut schema_bytes = Vec::new();
schema.write_to_bin(&mut schema_bytes)?;
2019-04-08 15:19:57 +02:00
inner.set("schema", schema_bytes)?;
let word_index = WordIndex::default();
inner.set("word-index", word_index.into_bytes())?;
let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));
let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
2019-04-18 13:58:35 +02:00
Ok(RawIndex { schema, word_index, ranked_map, inner })
2019-04-08 15:19:57 +02:00
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
self.word_index.lease()
}
pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
self.ranked_map.lease()
}
pub fn update_word_index(&self, word_index: Arc<WordIndex>) -> sled::Result<()> {
let data = word_index.into_bytes();
self.inner.set("word-index", data).map(drop)?;
self.word_index.store(word_index);
Ok(())
}
2019-04-18 13:58:35 +02:00
pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) {
self.ranked_map.store(ranked_map)
}
pub fn set_document_attribute<V>(
&self,
id: DocumentId,
attr: SchemaAttr,
value: V,
) -> Result<Option<IVec>, sled::Error>
where IVec: From<V>,
{
let key = document_key(id, attr);
Ok(self.inner.set(key, value)?)
}
pub fn get_document_attribute(
&self,
id: DocumentId,
attr: SchemaAttr
) -> Result<Option<IVec>, sled::Error>
{
let key = document_key(id, attr);
Ok(self.inner.get(key)?)
}
2019-04-18 13:58:35 +02:00
pub fn get_document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
let start = document_key(id, SchemaAttr::min());
let end = document_key(id, SchemaAttr::max());
DocumentFieldsIter(self.inner.range(start..=end))
}
pub fn del_document_attribute(
&self,
id: DocumentId,
attr: SchemaAttr
) -> Result<Option<IVec>, sled::Error>
{
let key = document_key(id, attr);
Ok(self.inner.del(key)?)
}
2019-04-08 15:19:57 +02:00
}
2019-04-18 13:58:35 +02:00
pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
impl<'a> Iterator for DocumentFieldsIter<'a> {
type Item = Result<(DocumentId, SchemaAttr, IVec), Error>;
fn next(&mut self) -> Option<Self::Item> {
match self.0.next() {
Some(Ok((key, value))) => {
let (id, attr) = extract_document_key(key).unwrap();
Some(Ok((id, attr, value)))
},
Some(Err(e)) => Some(Err(Error::SledError(e))),
None => None,
}
}
}
/// Public handle on an index; a thin wrapper around a `RawIndex` exposing
/// querying, document retrieval and document addition/deletion.
#[derive(Clone)]
pub struct Index(RawIndex);
impl Index {
pub fn query_builder(&self) -> QueryBuilder<Lease<Arc<WordIndex>>> {
let word_index = self.word_index();
QueryBuilder::new(word_index)
}
pub fn query_builder_with_criteria<'c>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, Lease<Arc<WordIndex>>>
{
let word_index = self.word_index();
QueryBuilder::with_criteria(word_index, criteria)
}
2019-04-18 13:58:35 +02:00
pub fn schema(&self) -> &Schema {
self.0.schema()
}
pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
self.0.word_index()
}
pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
self.0.ranked_map()
}
pub fn documents_addition(&self) -> DocumentsAddition {
let index = self.0.clone();
DocumentsAddition::from_raw(index)
}
pub fn documents_deletion(&self) -> DocumentsDeletion {
let index = self.0.clone();
DocumentsDeletion::from_raw(index)
}
2019-04-18 13:58:35 +02:00
pub fn document<T>(
&self,
fields: Option<&HashSet<&str>>,
id: DocumentId,
) -> Result<Option<T>, RmpError>
where T: de::DeserializeOwned,
{
let fields = match fields {
Some(fields) => {
let iter = fields.iter().filter_map(|n| self.0.schema().attribute(n));
Some(HashSet::from_iter(iter))
},
None => None,
};
let mut deserializer = Deserializer {
document_id: id,
raw_index: &self.0,
fields: fields.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
T::deserialize(&mut deserializer).map(Some)
}
}
/// A batch of document additions; documents are passed to `update_document`
/// and the word index is updated when `finalize` is called.
pub struct DocumentsAddition {
/// The index receiving the documents.
inner: RawIndex,
/// Indexer handed to the serializer for each document; `finalize` builds
/// the delta word index from it.
indexer: Indexer,
}
impl DocumentsAddition {
pub fn from_raw(inner: RawIndex) -> DocumentsAddition {
DocumentsAddition { inner, indexer: Indexer::new() }
}
pub fn update_document<D>(&mut self, document: D) -> Result<(), Error>
where D: serde::Serialize,
{
let schema = self.inner.schema();
let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
let serializer = Serializer {
schema,
index: &self.inner,
indexer: &mut self.indexer,
document_id,
};
document.serialize(serializer)?;
Ok(())
}
pub fn finalize(self) -> sled::Result<()> {
let delta_index = self.indexer.build();
let index = self.inner.word_index();
let new_index = index.r#union(&delta_index);
let new_index = Arc::from(new_index);
self.inner.update_word_index(new_index)?;
Ok(())
}
}
/// A batch of document deletions; ids are queued with `delete_document` and
/// removed from the word index when `finalize` is called.
pub struct DocumentsDeletion {
/// The index the documents are removed from.
inner: RawIndex,
/// Ids queued for deletion, in insertion order and possibly with duplicates.
documents: Vec<DocumentId>,
}
impl DocumentsDeletion {
pub fn from_raw(inner: RawIndex) -> DocumentsDeletion {
DocumentsDeletion { inner, documents: Vec::new() }
}
pub fn delete_document(&mut self, id: DocumentId) {
self.documents.push(id);
}
pub fn finalize(mut self) -> Result<(), Error> {
self.documents.sort_unstable();
self.documents.dedup();
let idset = SetBuf::new_unchecked(self.documents);
let index = self.inner.word_index();
let new_index = index.remove_documents(&idset);
let new_index = Arc::from(new_index);
self.inner.update_word_index(new_index)?;
Ok(())
}
}