mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-03 18:31:38 +01:00
chore: Rework the data module structures
being able to be constructed from SharedData
This commit is contained in:
parent
c022fa3fca
commit
64d53ee1bd
@ -59,10 +59,10 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<D
|
||||
}
|
||||
};
|
||||
|
||||
update.update_document(&document).unwrap();
|
||||
update.update_document(&document, &tokenizer_builder)?;
|
||||
}
|
||||
|
||||
let mut update = update.build()?;
|
||||
let update = update.build()?;
|
||||
database.ingest_update_file(update)?;
|
||||
|
||||
Ok(database)
|
||||
|
@ -1,62 +1,53 @@
|
||||
use std::io::{self, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::sync::Arc;
|
||||
use std::{io, mem};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct DocIds {
|
||||
data: SharedData,
|
||||
}
|
||||
pub struct DocIds(SharedData);
|
||||
|
||||
impl DocIds {
|
||||
pub fn empty() -> Self {
|
||||
DocIds { data: SharedData::empty() }
|
||||
pub fn new(ids: &Set<DocumentId>) -> DocIds {
|
||||
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
|
||||
let data = SharedData::from_bytes(bytes.to_vec());
|
||||
DocIds(data)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let len = vec.len();
|
||||
DocIds::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
}
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let doc_ids = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
||||
let data = SharedData { bytes, offset, len };
|
||||
DocIds::from_data(data)
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.data
|
||||
}
|
||||
|
||||
fn from_data(data: SharedData) -> io::Result<Self> {
|
||||
let len = data.as_ref().read_u64::<LittleEndian>()?;
|
||||
let data = data.range(mem::size_of::<u64>(), len as usize);
|
||||
Ok(DocIds { data })
|
||||
}
|
||||
|
||||
pub fn from_raw(vec: Vec<DocumentId>) -> Self {
|
||||
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
|
||||
Ok(DocIds(doc_ids))
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.data.len() as u64;
|
||||
let len = self.0.len() as u64;
|
||||
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.data);
|
||||
bytes.extend_from_slice(&self.0);
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn doc_ids(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.data;
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for DocIds {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.0;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / mem::size_of::<DocumentId>();
|
||||
let len = slice.len() / size_of::<DocumentId>();
|
||||
let slice = unsafe { from_raw_parts(ptr, len) };
|
||||
Set::new_unchecked(slice)
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::io::{self, Write, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::io::{self, Write};
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
use std::sync::Arc;
|
||||
@ -9,6 +9,7 @@ use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[repr(C)]
|
||||
@ -24,40 +25,36 @@ pub struct DocIndexes {
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<DocIndexes> {
|
||||
let len = vec.len();
|
||||
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
|
||||
let bytes = Arc::new(bytes);
|
||||
let len = bytes.len();
|
||||
let data = SharedData::new(bytes, 0, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
DocIndexes::from_cursor(&mut cursor)
|
||||
}
|
||||
|
||||
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<DocIndexes> {
|
||||
let data = SharedData { bytes, offset, len };
|
||||
DocIndexes::from_data(data)
|
||||
}
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let ranges = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
fn from_data(data: SharedData) -> io::Result<DocIndexes> {
|
||||
let ranges_len_offset = data.len() - size_of::<u64>();
|
||||
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize;
|
||||
|
||||
let ranges_offset = ranges_len_offset - ranges_len;
|
||||
let ranges = data.range(ranges_offset, ranges_len);
|
||||
|
||||
let indexes = data.range(0, ranges_offset);
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let indexes = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let ranges_len = self.ranges.len() as u64;
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let u64_size = size_of::<u64>() as u64;
|
||||
let len = indexes_len + ranges_len + u64_size;
|
||||
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
}
|
||||
|
||||
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
||||
@ -97,12 +94,17 @@ impl Index<usize> for DocIndexes {
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
indexes: Vec<DocIndex>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl DocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
DocIndexesBuilder::new(Vec::new())
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -110,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
// write the values
|
||||
let indexes = unsafe { into_u8_slice(indexes) };
|
||||
self.wtr.write_all(indexes)
|
||||
self.indexes.extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
@ -130,24 +131,20 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
// write the ranges
|
||||
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
// write the length of the ranges
|
||||
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
||||
let len = indexes.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(indexes)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
|
||||
let ptr = slice.as_ptr() as *const u8;
|
||||
let len = slice.len() * size_of::<T>();
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@ -177,9 +174,9 @@ mod tests {
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(bytes)?;
|
||||
@ -212,18 +209,17 @@ mod tests {
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let builder_bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
docs.write_to_bytes(&mut bytes);
|
||||
let len = size_of::<u64>();
|
||||
|
||||
assert_eq!(builder_bytes, &bytes[len..]);
|
||||
assert_eq!(builder_bytes, bytes);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,26 +1,30 @@
|
||||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
#[derive(Clone)]
|
||||
struct SharedData {
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
#[derive(Default, Clone)]
|
||||
pub struct SharedData {
|
||||
pub bytes: Arc<Vec<u8>>,
|
||||
pub offset: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl SharedData {
|
||||
pub fn empty() -> SharedData {
|
||||
SharedData {
|
||||
bytes: Arc::default(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
|
||||
let len = vec.len();
|
||||
let bytes = Arc::new(vec);
|
||||
SharedData::new(bytes, 0, len)
|
||||
}
|
||||
|
||||
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
|
||||
SharedData { bytes, offset, len }
|
||||
}
|
||||
|
||||
pub fn range(&self, offset: usize, len: usize) -> SharedData {
|
||||
@ -33,12 +37,6 @@ impl SharedData {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SharedData {
|
||||
fn default() -> SharedData {
|
||||
SharedData::empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
@ -52,3 +50,9 @@ impl AsRef<[u8]> for SharedData {
|
||||
&self.bytes[self.offset..self.offset + self.len]
|
||||
}
|
||||
}
|
||||
|
||||
/// Reinterprets a slice of `T` as the raw bytes that back it.
///
/// The returned slice starts at the same address as `slice`, is
/// `slice.len() * size_of::<T>()` bytes long and borrows from `slice`.
///
/// # Safety
///
/// Every byte of every element is exposed as a `u8`.
/// NOTE(review): callers presumably must only pass element types whose
/// bytes are all initialized (no padding) — confirm against the call sites.
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
    let byte_count = size_of::<T>() * slice.len();
    let first_byte = slice.as_ptr() as *const u8;
    from_raw_parts(first_byte, byte_count)
}
|
||||
|
@ -7,7 +7,7 @@ use rocksdb::rocksdb::{Writable, Snapshot};
|
||||
use rocksdb::{DB, DBVector, MergeOperands};
|
||||
use crossbeam::atomic::ArcCell;
|
||||
|
||||
use crate::database::index::{self, Index, Positive};
|
||||
use crate::database::index::Index;
|
||||
use crate::database::{DatabaseView, Update, Schema};
|
||||
use crate::database::{DATA_INDEX, DATA_SCHEMA};
|
||||
|
||||
@ -86,7 +86,7 @@ impl Database {
|
||||
};
|
||||
|
||||
let path = update.path().to_string_lossy();
|
||||
let mut options = IngestExternalFileOptions::new();
|
||||
let options = IngestExternalFileOptions::new();
|
||||
// options.move_files(move_update);
|
||||
|
||||
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
|
||||
@ -182,7 +182,6 @@ mod tests {
|
||||
};
|
||||
|
||||
let database = Database::create(&rocksdb_path, schema.clone())?;
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
|
||||
let update_path = dir.path().join("update.sst");
|
||||
|
||||
@ -201,11 +200,12 @@ mod tests {
|
||||
|
||||
let docid0;
|
||||
let docid1;
|
||||
let mut update = {
|
||||
let update = {
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
||||
|
||||
docid0 = builder.update_document(&doc0).unwrap();
|
||||
docid1 = builder.update_document(&doc1).unwrap();
|
||||
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
|
||||
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
|
||||
|
||||
builder.build()?
|
||||
};
|
||||
|
@ -4,18 +4,16 @@ mod positive;
|
||||
pub(crate) use self::negative::Negative;
|
||||
pub(crate) use self::positive::{Positive, PositiveBuilder};
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::error::Error;
|
||||
use std::io::{Cursor, BufRead};
|
||||
use std::io::Cursor;
|
||||
use std::sync::Arc;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
use fst::raw::Fst;
|
||||
use fst::Map;
|
||||
|
||||
use crate::data::{DocIds, DocIndexes};
|
||||
use crate::data::{SharedData, DocIndexes};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Index {
|
||||
@ -35,8 +33,11 @@ impl Index {
|
||||
len: usize,
|
||||
) -> Result<Index, Box<Error>>
|
||||
{
|
||||
let (negative, neg_offset) = Negative::from_shared_bytes(bytes.clone(), offset, len)?;
|
||||
let (positive, _) = Positive::from_shared_bytes(bytes, offset + neg_offset, len)?;
|
||||
let data = SharedData::new(bytes, offset, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
|
||||
let negative = Negative::from_cursor(&mut cursor)?;
|
||||
let positive = Positive::from_cursor(&mut cursor)?;
|
||||
Ok(Index { negative, positive })
|
||||
}
|
||||
|
||||
@ -71,7 +72,7 @@ impl Index {
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive { map, indexes }
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
let negative = Negative::default();
|
||||
|
@ -1,46 +1,36 @@
|
||||
use std::io::{Cursor, BufRead};
|
||||
use std::error::Error;
|
||||
use std::mem::size_of;
|
||||
use std::io::Cursor;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use sdset::Set;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use byteorder::{LittleEndian, WriteBytesExt};
|
||||
|
||||
use crate::data::SharedData;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Negative {
|
||||
pub doc_ids: DocIds,
|
||||
}
|
||||
pub struct Negative(DocIds);
|
||||
|
||||
impl Negative {
|
||||
pub fn from_shared_bytes(
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
) -> Result<(Negative, usize), Box<Error>>
|
||||
{
|
||||
let mut cursor = Cursor::new(&bytes[..len]);
|
||||
cursor.consume(offset);
|
||||
pub fn new(doc_ids: DocIds) -> Negative {
|
||||
Negative(doc_ids)
|
||||
}
|
||||
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let doc_ids = DocIds::from_shared_bytes(bytes, offset, len)?;
|
||||
|
||||
Ok((Negative { doc_ids }, offset + len))
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
|
||||
let doc_ids = DocIds::from_cursor(cursor)?;
|
||||
Ok(Negative(doc_ids))
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let slice = self.doc_ids.as_bytes();
|
||||
let slice = self.0.as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.doc_ids.doc_ids().is_empty()
|
||||
self.0.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,6 +38,6 @@ impl Deref for Negative {
|
||||
type Target = Set<DocumentId>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.doc_ids.doc_ids()
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
use std::io::{Write, BufRead, Cursor};
|
||||
use std::mem::size_of;
|
||||
use std::error::Error;
|
||||
use std::sync::Arc;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{map, Map, Streamer, IntoStreamer};
|
||||
@ -10,51 +8,51 @@ use sdset::duo::Union;
|
||||
use fst::raw::Fst;
|
||||
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use crate::data::SharedData;
|
||||
use crate::DocIndex;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Positive {
|
||||
pub map: Map,
|
||||
pub indexes: DocIndexes,
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl Positive {
|
||||
pub fn from_shared_bytes(
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
) -> Result<(Positive, usize), Box<Error>>
|
||||
{
|
||||
let mut cursor = Cursor::new(&bytes[..len]);
|
||||
cursor.consume(offset);
|
||||
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
|
||||
Positive { map, indexes }
|
||||
}
|
||||
|
||||
let map_len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let map_offset = cursor.position() as usize;
|
||||
let fst = Fst::from_shared_bytes(bytes.clone(), map_offset, map_len)?;
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let data = cursor.get_ref().range(offset, len);
|
||||
|
||||
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
|
||||
let map = Map::from(fst);
|
||||
cursor.consume(len);
|
||||
|
||||
cursor.consume(map_len);
|
||||
let indexes_len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let indexes_offset = cursor.position() as usize;
|
||||
let indexes = DocIndexes::from_shared_bytes(bytes, indexes_offset, indexes_len)?;
|
||||
let indexes = DocIndexes::from_cursor(cursor)?;
|
||||
|
||||
let positive = Positive { map, indexes };
|
||||
let len = indexes_offset + indexes_len;
|
||||
|
||||
Ok((positive, len))
|
||||
Ok(Positive { map, indexes})
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
// indexes
|
||||
let slice = self.map.as_fst().as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
|
||||
// map
|
||||
self.indexes.write_to_bytes(bytes);
|
||||
}
|
||||
|
||||
pub fn map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
|
||||
@ -155,15 +153,11 @@ impl<W: Write, X: Write> PositiveBuilder<W, X> {
|
||||
where K: AsRef<[u8]>,
|
||||
{
|
||||
self.map.insert(key, self.value)?;
|
||||
self.indexes.insert(indexes)?;
|
||||
self.indexes.insert(indexes);
|
||||
self.value += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
let map = self.map.into_inner()?;
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
|
@ -141,10 +141,12 @@ impl Schema {
|
||||
attributes
|
||||
}
|
||||
|
||||
pub fn document_id<T>(&self, document: &T) -> Result<DocumentId, Box<Error>>
|
||||
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
unimplemented!()
|
||||
let id_attribute_name = &self.inner.identifier;
|
||||
let serializer = FindDocumentIdSerializer { id_attribute_name };
|
||||
document.serialize(serializer)
|
||||
}
|
||||
|
||||
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
|
||||
|
@ -1,4 +1,4 @@
|
||||
use crate::database::update::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::update::RawUpdateBuilder;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
@ -10,7 +10,7 @@ use serde::ser;
|
||||
|
||||
pub struct IndexerSerializer<'a, B> {
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
pub document_id: DocumentId,
|
||||
pub attribute: SchemaAttr,
|
||||
}
|
||||
@ -72,10 +72,10 @@ where B: TokenizerBuilder
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
self.builder.insert(word_unidecoded, doc_index);
|
||||
self.builder.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
||||
}
|
||||
|
||||
self.builder.insert(word_lower, doc_index);
|
||||
self.builder.insert_doc_index(word_lower.into_bytes(), doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,24 +1,20 @@
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::update::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::document_key::DocumentKeyAttr;
|
||||
use crate::database::update::NewState;
|
||||
use crate::database::Schema;
|
||||
use crate::database::update::RawUpdateBuilder;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Serializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub document_id: DocumentId,
|
||||
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
@ -145,7 +141,6 @@ where B: TokenizerBuilder
|
||||
document_id: self.document_id,
|
||||
current_key_name: None,
|
||||
builder: self.builder,
|
||||
new_states: self.new_states,
|
||||
})
|
||||
}
|
||||
|
||||
@ -160,7 +155,6 @@ where B: TokenizerBuilder
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
builder: self.builder,
|
||||
new_states: self.new_states,
|
||||
})
|
||||
}
|
||||
|
||||
@ -181,8 +175,7 @@ pub struct MapSerializer<'a, B> {
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub document_id: DocumentId,
|
||||
pub current_key_name: Option<String>,
|
||||
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
|
||||
@ -220,7 +213,7 @@ where B: TokenizerBuilder
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.new_states.insert(key, NewState::Updated { value });
|
||||
self.builder.insert_attribute_value(key, value);
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
@ -243,10 +236,9 @@ where B: TokenizerBuilder
|
||||
|
||||
pub struct StructSerializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub document_id: DocumentId,
|
||||
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
@ -267,7 +259,7 @@ where B: TokenizerBuilder
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.new_states.insert(key, NewState::Updated { value });
|
||||
self.builder.insert_attribute_value(key, value);
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
|
@ -1,95 +1,60 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use fst::map::{Map, MapBuilder};
|
||||
use rocksdb::rocksdb_options;
|
||||
use serde::Serialize;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
|
||||
use crate::database::{DATA_INDEX, Schema, DocumentKeyAttr};
|
||||
use crate::data::{DocIds, DocIndexes};
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use super::Update;
|
||||
use crate::database::serde::serializer::Serializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::Schema;
|
||||
|
||||
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
|
||||
type Value = Vec<u8>;
|
||||
use crate::DocumentId;
|
||||
use super::{Update, RawUpdateBuilder};
|
||||
|
||||
pub struct UpdateBuilder {
|
||||
sst_file: PathBuf,
|
||||
schema: Schema,
|
||||
removed_documents: BTreeSet<DocumentId>,
|
||||
words_indexes: BTreeMap<Token, Vec<DocIndex>>,
|
||||
keys_values: BTreeMap<DocumentKeyAttr, Value>,
|
||||
raw_builder: RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl UpdateBuilder {
|
||||
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
|
||||
UpdateBuilder {
|
||||
sst_file: path,
|
||||
schema: schema,
|
||||
removed_documents: BTreeSet::new(),
|
||||
words_indexes: BTreeMap::new(),
|
||||
keys_values: BTreeMap::new(),
|
||||
raw_builder: RawUpdateBuilder::new(path),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_document<T>(&mut self, document: T) -> Result<DocumentId, Box<Error>>
|
||||
pub fn update_document<T, B>(
|
||||
&mut self,
|
||||
document: T,
|
||||
tokenizer_builder: &B,
|
||||
) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
B: TokenizerBuilder,
|
||||
{
|
||||
unimplemented!()
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: document_id,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
builder: &mut self.raw_builder,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, Box<Error>>
|
||||
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
unimplemented!()
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
self.raw_builder.remove_document(document_id);
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let tree = {
|
||||
let negative = {
|
||||
let documents_ids = self.removed_documents.into_iter().collect();
|
||||
let doc_ids = DocIds::from_raw(documents_ids);
|
||||
Negative { doc_ids }
|
||||
};
|
||||
|
||||
let positive = {
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
|
||||
for (key, mut indexes) in self.words_indexes {
|
||||
indexes.sort_unstable();
|
||||
let indexes = Set::new_unchecked(&indexes);
|
||||
builder.insert(key, indexes);
|
||||
}
|
||||
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive { map, indexes }
|
||||
};
|
||||
|
||||
Index { negative, positive }
|
||||
};
|
||||
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.sst_file.to_string_lossy())?;
|
||||
|
||||
// write the data-index
|
||||
let mut bytes = Vec::new();
|
||||
tree.write_to_bytes(&mut bytes);
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents attributes updates
|
||||
for (key, value) in self.keys_values {
|
||||
file_writer.put(key.as_ref(), &value)?;
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
|
||||
Ok(Update { sst_file: self.sst_file })
|
||||
self.raw_builder.build()
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,10 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
mod builder;
|
||||
mod raw_builder;
|
||||
|
||||
pub use self::builder::UpdateBuilder;
|
||||
pub use self::raw_builder::RawUpdateBuilder;
|
||||
|
||||
pub struct Update {
|
||||
sst_file: PathBuf,
|
||||
|
93
src/database/update/raw_builder.rs
Normal file
93
src/database/update/raw_builder.rs
Normal file
@ -0,0 +1,93 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use rocksdb::rocksdb_options;
|
||||
use fst::map::Map;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
|
||||
use crate::database::{DATA_INDEX, DocumentKeyAttr};
|
||||
use crate::data::{DocIds, DocIndexes};
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use super::Update;
|
||||
|
||||
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
|
||||
type Value = Vec<u8>;
|
||||
|
||||
pub struct RawUpdateBuilder {
|
||||
sst_file: PathBuf,
|
||||
removed_documents: BTreeSet<DocumentId>,
|
||||
words_indexes: BTreeMap<Token, Vec<DocIndex>>,
|
||||
keys_values: BTreeMap<DocumentKeyAttr, Value>,
|
||||
}
|
||||
|
||||
impl RawUpdateBuilder {
|
||||
pub fn new(path: PathBuf) -> RawUpdateBuilder {
|
||||
RawUpdateBuilder {
|
||||
sst_file: path,
|
||||
removed_documents: BTreeSet::new(),
|
||||
words_indexes: BTreeMap::new(),
|
||||
keys_values: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
|
||||
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
|
||||
}
|
||||
|
||||
pub fn insert_attribute_value(&mut self, key_attr: DocumentKeyAttr, value: Vec<u8>) -> Option<Vec<u8>> {
|
||||
self.keys_values.insert(key_attr, value)
|
||||
}
|
||||
|
||||
pub fn remove_document(&mut self, id: DocumentId) {
|
||||
self.removed_documents.insert(id);
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let tree = {
|
||||
let negative = {
|
||||
let documents_ids: Vec<_> = self.removed_documents.into_iter().collect();
|
||||
let documents_ids = Set::new_unchecked(&documents_ids);
|
||||
let doc_ids = DocIds::new(documents_ids);
|
||||
Negative::new(doc_ids)
|
||||
};
|
||||
|
||||
let positive = {
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
|
||||
for (key, mut indexes) in self.words_indexes {
|
||||
indexes.sort_unstable();
|
||||
let indexes = Set::new_unchecked(&indexes);
|
||||
builder.insert(key, indexes)?;
|
||||
}
|
||||
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
Index { negative, positive }
|
||||
};
|
||||
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.sst_file.to_string_lossy())?;
|
||||
|
||||
// write the data-index
|
||||
let mut bytes = Vec::new();
|
||||
tree.write_to_bytes(&mut bytes);
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents attributes updates
|
||||
for (key, value) in self.keys_values {
|
||||
file_writer.put(key.as_ref(), &value)?;
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
|
||||
Ok(Update { sst_file: self.sst_file })
|
||||
}
|
||||
}
|
@ -86,7 +86,7 @@ where D: Deref<Target=DB>,
|
||||
let mut stream = {
|
||||
let mut op_builder = fst::map::OpBuilder::new();
|
||||
for automaton in &automatons {
|
||||
let stream = self.view.index().positive.map.search(automaton);
|
||||
let stream = self.view.index().positive.map().search(automaton);
|
||||
op_builder.push(stream);
|
||||
}
|
||||
op_builder.union()
|
||||
@ -100,7 +100,7 @@ where D: Deref<Target=DB>,
|
||||
let distance = automaton.eval(input).to_u8();
|
||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||
|
||||
let doc_indexes = &self.view.index().positive.indexes;
|
||||
let doc_indexes = &self.view.index().positive.indexes();
|
||||
let doc_indexes = &doc_indexes[iv.value as usize];
|
||||
|
||||
for doc_index in doc_indexes {
|
||||
|
Loading…
x
Reference in New Issue
Block a user