feat: Introduce Tree wrappers for each index component

2025-05-25 09:03:59 +02:00 · 2019-05-06 14:13:09 +02:00 · 2019-05-06 14:13:09 +02:00 · 0c18026240
commit 0c18026240
parent 6eb25687f8
5 changed files with 186 additions and 26 deletions
--- a/meilidb-data/src/database.rs
+++ b/meilidb-data/src/database.rs
@@ -1,4 +1,5 @@
 use std::collections::HashSet;
+use std::convert::TryInto;
 use std::io::{self, Cursor, BufRead};
 use std::iter::FromIterator;
 use std::path::Path;
@@ -8,15 +9,17 @@ use std::{error, fmt};
 use arc_swap::{ArcSwap, Lease};
 use byteorder::{ReadBytesExt, BigEndian};
 use hashbrown::HashMap;
-use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId};
+use meilidb_core::{criterion::Criteria, QueryBuilder, DocumentId, DocIndex};
 use rmp_serde::decode::{Error as RmpError};
 use sdset::SetBuf;
 use serde::de;
 use sled::IVec;
+use zerocopy::{AsBytes, LayoutVerified};

 use crate::{Schema, SchemaAttr, RankedMap};
 use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
 use crate::indexer::{Indexer, WordIndexTree};
+use crate::document_attr_key::DocumentAttrKey;

 pub type WordIndex = meilidb_core::Index<WordIndexTree>;

@@ -27,6 +30,7 @@ pub enum Error {
    WordIndexMissing,
    MissingDocumentId,
    SledError(sled::Error),
+    FstError(fst::Error),
    BincodeError(bincode::Error),
    SerializerError(SerializerError),
 }
@@ -37,6 +41,12 @@ impl From<sled::Error> for Error {
    }
 }

+impl From<fst::Error> for Error {
+    fn from(fst_error: fst::Error) -> Self {
+        Error::FstError(fst_error)
+    }
+}
+
 impl From<bincode::Error> for Error {
    fn from(error: bincode::Error) -> Error {
        Error::BincodeError(error)
@@ -58,6 +68,7 @@ impl fmt::Display for Error {
            WordIndexMissing => write!(f, "this index does not have a word index"),
            MissingDocumentId => write!(f, "document id is missing"),
            SledError(e) => write!(f, "sled error; {}", e),
+            FstError(e) => write!(f, "fst error; {}", e),
            BincodeError(e) => write!(f, "bincode error; {}", e),
            SerializerError(e) => write!(f, "serializer error; {}", e),
        }
@@ -180,6 +191,102 @@ impl Database {
    }
 }

+/// Groups the three per-component tree wrappers of an index.
+struct RawIndex2 {
+    main: MainIndex, // schema, words set and ranked-map entries
+    words: WordsIndex, // `DocIndex` sets looked up by word
+    documents: DocumentsIndex, // raw values keyed by `DocumentAttrKey`
+}
+
+/// Read-only accessors for the entries kept in the main sled tree.
+struct MainIndex(Arc<sled::Tree>);
+
+impl MainIndex {
+    /// Decodes the value stored under the `"schema"` key, or `None` if it is absent.
+    fn schema(&self) -> Result<Option<Schema>, Error> {
+        let bytes = match self.0.get("schema")? {
+            Some(bytes) => bytes,
+            None => return Ok(None),
+        };
+        let schema = Schema::read_from_bin(bytes.as_ref())?;
+        Ok(Some(schema))
+    }
+
+    /// Builds an `fst::Set` from the value stored under the `"words"` key, if any.
+    fn words_set(&self) -> Result<Option<fst::Set>, Error> {
+        let bytes = match self.0.get("words")? {
+            Some(bytes) => bytes,
+            None => return Ok(None),
+        };
+        let len = bytes.len();
+        let fst = fst::raw::Fst::from_shared_bytes(bytes.into(), 0, len)?;
+        Ok(Some(fst::Set::from(fst)))
+    }
+
+    /// Deserializes with bincode the value stored under `"ranked-map"`, if any.
+    fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
+        let bytes = match self.0.get("ranked-map")? {
+            Some(bytes) => bytes,
+            None => return Ok(None),
+        };
+        let ranked_map = bincode::deserialize(bytes.as_ref())?;
+        Ok(Some(ranked_map))
+    }
+}
+
+/// Wrapper around the sled tree that is queried by word.
+struct WordsIndex(Arc<sled::Tree>);
+
+impl WordsIndex {
+    /// Returns an owned copy of the `DocIndex` set stored under `word`, if any.
+    fn doc_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Error> {
+        let bytes = match self.0.get(word)? {
+            Some(bytes) => bytes,
+            None => return Ok(None),
+        };
+        // Panics when the stored bytes cannot be viewed as a `[DocIndex]` slice.
+        let layout = LayoutVerified::new_slice(bytes.as_ref()).expect("invalid layout");
+        Ok(Some(SetBuf::new_unchecked(layout.into_slice().to_vec())))
+    }
+}
+
+/// Wrapper around the sled tree keyed by encoded `DocumentAttrKey`s.
+struct DocumentsIndex(Arc<sled::Tree>);
+
+impl DocumentsIndex {
+    /// Looks up the raw value stored for attribute `attr` of document `id`.
+    fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> Result<Option<IVec>, Error> {
+        let key = DocumentAttrKey::new(id, attr);
+        Ok(self.0.get(key.to_be_bytes())?)
+    }
+
+    /// Scans the keys of document `id`, from its lowest to its highest attribute.
+    fn document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
+        let first = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes();
+        let last = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes();
+
+        DocumentFieldsIter(self.0.range(first..=last))
+    }
+}
+
+/// Iterator over the `(attribute, value)` pairs yielded by a sled range scan.
+pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
+
+impl<'a> Iterator for DocumentFieldsIter<'a> {
+    type Item = Result<(SchemaAttr, IVec), Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let entry = self.0.next()?;
+
+        Some(entry.map_err(Error::SledError).map(|(key, value)| {
+            // Panics if the key is not exactly 10 bytes long.
+            let raw_key: &[u8] = key.as_ref();
+            let array = raw_key.try_into().unwrap();
+            let decoded = DocumentAttrKey::from_be_bytes(array);
+            (decoded.attribute, value)
+        }))
+    }
+}
+
 #[derive(Clone)]
 pub struct RawIndex {
    schema: Schema,
@@ -294,23 +401,6 @@ impl RawIndex {
    }
 }

-pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
-
-impl<'a> Iterator for DocumentFieldsIter<'a> {
-    type Item = Result<(DocumentId, SchemaAttr, IVec), Error>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        match self.0.next() {
-            Some(Ok((key, value))) => {
-                let (id, attr) = extract_document_key(key).unwrap();
-                Some(Ok((id, attr, value)))
-            },
-            Some(Err(e)) => Some(Err(Error::SledError(e))),
-            None => None,
-        }
-    }
-}
-
 #[derive(Clone)]
 pub struct Index(RawIndex);

--- a/meilidb-data/src/document_attr_key.rs
+++ b/meilidb-data/src/document_attr_key.rs
@@ -0,0 +1,69 @@
+use meilidb_core::DocumentId;
+use crate::schema::SchemaAttr;
+
+/// Key identifying one attribute of one document.
+///
+/// The byte form produced by `to_be_bytes` is 10 bytes long:
+///
+/// - bytes `0..8` hold the document id, big-endian;
+/// - bytes `8..10` hold the attribute, big-endian.
+///
+/// The derived ordering compares `document_id` first, then `attribute`.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct DocumentAttrKey {
+    pub document_id: DocumentId,
+    pub attribute: SchemaAttr,
+}
+
+impl DocumentAttrKey {
+    /// Builds a key from a document id and one of its attributes.
+    pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey {
+        DocumentAttrKey { document_id, attribute }
+    }
+
+    /// Encodes the key as the big-endian document id followed by the
+    /// big-endian attribute.
+    ///
+    /// `from_be_bytes` is the inverse of this method.
+    pub fn to_be_bytes(self) -> [u8; 10] {
+        let mut output = [0u8; 10];
+
+        // Plain slice copies: the lengths are checked, so no `unsafe` is needed.
+        output[..8].copy_from_slice(&self.document_id.0.to_be_bytes());
+        output[8..].copy_from_slice(&self.attribute.0.to_be_bytes());
+
+        output
+    }
+
+    /// Decodes a key from the 10-byte form produced by `to_be_bytes`.
+    ///
+    /// Every 10-byte input decodes to some key, so this cannot fail.
+    pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey {
+        // The first 8 bytes are the document id...
+        let mut document_id = [0u8; 8];
+        document_id.copy_from_slice(&bytes[..8]);
+
+        // ...and the last 2 bytes are the attribute.
+        let mut attribute = [0u8; 2];
+        attribute.copy_from_slice(&bytes[8..]);
+
+        DocumentAttrKey {
+            document_id: DocumentId(u64::from_be_bytes(document_id)),
+            attribute: SchemaAttr(u16::from_be_bytes(attribute)),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A key survives an encode/decode round trip unchanged.
+    #[test]
+    fn to_from_be_bytes() {
+        let key = DocumentAttrKey::new(DocumentId(67578308), SchemaAttr(3456));
+        let decoded = DocumentAttrKey::from_be_bytes(key.to_be_bytes());
+
+        assert_eq!(key, decoded);
+    }
+}
--- a/meilidb-data/src/lib.rs
+++ b/meilidb-data/src/lib.rs
@@ -1,4 +1,5 @@
 mod database;
+mod document_attr_key;
 mod indexer;
 mod number;
 mod ranked_map;
--- a/meilidb-data/src/schema.rs
+++ b/meilidb-data/src/schema.rs
@@ -186,12 +186,16 @@ impl Schema {
 pub struct SchemaAttr(pub u16);

 impl SchemaAttr {
-    pub fn new(value: u16) -> SchemaAttr {
+    pub const fn new(value: u16) -> SchemaAttr {
        SchemaAttr(value)
    }

-    pub fn min() -> SchemaAttr {
-        SchemaAttr(0)
+    pub const fn min() -> SchemaAttr {
+        SchemaAttr(u16::min_value())
+    }
+
+    pub const fn max() -> SchemaAttr {
+        SchemaAttr(u16::max_value())
    }

    pub fn next(self) -> Option<SchemaAttr> {
@@ -201,10 +205,6 @@ impl SchemaAttr {
    pub fn prev(self) -> Option<SchemaAttr> {
        self.0.checked_sub(1).map(SchemaAttr)
    }
-
-    pub fn max() -> SchemaAttr {
-        SchemaAttr(u16::MAX)
-    }
 }

 impl fmt::Display for SchemaAttr {
--- a/meilidb-data/src/serde/deserializer.rs
+++ b/meilidb-data/src/serde/deserializer.rs
@@ -45,7 +45,7 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
                },
            }
        });
-        let iter = document_attributes.filter_map(|(_, attr, value)| {
+        let iter = document_attributes.filter_map(|(attr, value)| {
            if self.fields.map_or(true, |f| f.contains(&attr)) {
                let attribute_name = self.raw_index.schema().attribute_name(attr);
                Some((attribute_name, Value::new(value)))