Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-07-04 20:37:15 +02:00
chore: Rework the data module structures so they can be constructed from SharedData
Parent: c022fa3fca
Commit: 64d53ee1bd
15 changed files with 292 additions and 262 deletions
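
Not part of the commit itself: the sketch below illustrates, with hypothetical stand-in names rather than the crate's real API, the pattern this change adopts — each index section is serialized as a little-endian u64 length followed by its bytes, and each component reconstructs itself from a Cursor over a shared, zero-copy window into a single Arc'd buffer (what the diff calls SharedData).

use std::io::{Cursor, Read};
use std::sync::Arc;

// Stand-in for the crate's SharedData: an Arc'd buffer plus an (offset, len)
// window into it, cloneable without copying the bytes.
#[derive(Clone)]
struct SharedData {
    bytes: Arc<Vec<u8>>,
    offset: usize,
    len: usize,
}

impl SharedData {
    fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
        SharedData { bytes, offset, len }
    }

    // Narrow the window to a sub-range of the current one, still zero-copy.
    fn range(&self, offset: usize, len: usize) -> SharedData {
        SharedData::new(self.bytes.clone(), self.offset + offset, len)
    }
}

impl AsRef<[u8]> for SharedData {
    // Cursor<T> implements Read for any T: AsRef<[u8]>, so a Cursor<SharedData>
    // reads exactly the bytes of this window.
    fn as_ref(&self) -> &[u8] {
        &self.bytes[self.offset..self.offset + self.len]
    }
}

// A section is a little-endian u64 length followed by its payload:
// read the length, take a zero-copy range, and advance the cursor past it.
fn read_section(cursor: &mut Cursor<SharedData>) -> std::io::Result<SharedData> {
    let mut len_buf = [0u8; 8];
    cursor.read_exact(&mut len_buf)?;
    let len = u64::from_le_bytes(len_buf) as usize;

    let offset = cursor.position() as usize;
    let section = cursor.get_ref().range(offset, len);
    cursor.set_position((offset + len) as u64);
    Ok(section)
}

fn main() -> std::io::Result<()> {
    // Two length-prefixed sections, standing in for the negative and positive
    // parts of the index blob.
    let mut buffer = Vec::new();
    for payload in [&b"negative"[..], &b"positive"[..]] {
        buffer.extend_from_slice(&(payload.len() as u64).to_le_bytes());
        buffer.extend_from_slice(payload);
    }

    let len = buffer.len();
    let data = SharedData::new(Arc::new(buffer), 0, len);
    let mut cursor = Cursor::new(data);

    let first = read_section(&mut cursor)?;
    let second = read_section(&mut cursor)?;
    assert_eq!(first.as_ref(), &b"negative"[..]);
    assert_eq!(second.as_ref(), &b"positive"[..]);
    Ok(())
}

The point of the rework visible in the diff below: instead of every type slicing the Arc<Vec<u8>> with hand-computed offsets, the cursor carries the position, so Negative and Positive can simply read themselves in sequence via from_cursor.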
@@ -7,7 +7,7 @@ use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, DBVector, MergeOperands};
use crossbeam::atomic::ArcCell;

use crate::database::index::{self, Index, Positive};
use crate::database::index::Index;
use crate::database::{DatabaseView, Update, Schema};
use crate::database::{DATA_INDEX, DATA_SCHEMA};

@@ -86,7 +86,7 @@ impl Database {
};

let path = update.path().to_string_lossy();
let mut options = IngestExternalFileOptions::new();
let options = IngestExternalFileOptions::new();
// options.move_files(move_update);

let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");

@@ -182,7 +182,6 @@ mod tests {
};

let database = Database::create(&rocksdb_path, schema.clone())?;
let tokenizer_builder = DefaultBuilder::new();

let update_path = dir.path().join("update.sst");

@@ -201,11 +200,12 @@ mod tests {

let docid0;
let docid1;
let mut update = {
let update = {
let tokenizer_builder = DefaultBuilder::new();
let mut builder = UpdateBuilder::new(update_path, schema);

docid0 = builder.update_document(&doc0).unwrap();
docid1 = builder.update_document(&doc1).unwrap();
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;

builder.build()?
};
@@ -4,18 +4,16 @@ mod positive;
pub(crate) use self::negative::Negative;
pub(crate) use self::positive::{Positive, PositiveBuilder};

use std::sync::Arc;
use std::error::Error;
use std::io::{Cursor, BufRead};
use std::io::Cursor;
use std::sync::Arc;

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use fst::raw::Fst;
use fst::Map;

use crate::data::{DocIds, DocIndexes};
use crate::data::{SharedData, DocIndexes};

#[derive(Default)]
pub struct Index {

@@ -35,8 +33,11 @@ impl Index {
len: usize,
) -> Result<Index, Box<Error>>
{
let (negative, neg_offset) = Negative::from_shared_bytes(bytes.clone(), offset, len)?;
let (positive, _) = Positive::from_shared_bytes(bytes, offset + neg_offset, len)?;
let data = SharedData::new(bytes, offset, len);
let mut cursor = Cursor::new(data);

let negative = Negative::from_cursor(&mut cursor)?;
let positive = Positive::from_cursor(&mut cursor)?;
Ok(Index { negative, positive })
}

@@ -71,7 +72,7 @@ impl Index {
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive { map, indexes }
Positive::new(map, indexes)
};

let negative = Negative::default();
@@ -1,46 +1,36 @@
use std::io::{Cursor, BufRead};
use std::error::Error;
use std::mem::size_of;
use std::io::Cursor;
use std::ops::Deref;
use std::sync::Arc;

use sdset::Set;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use byteorder::{LittleEndian, WriteBytesExt};

use crate::data::SharedData;
use crate::data::DocIds;
use crate::DocumentId;

#[derive(Default)]
pub struct Negative {
pub doc_ids: DocIds,
}
pub struct Negative(DocIds);

impl Negative {
pub fn from_shared_bytes(
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
) -> Result<(Negative, usize), Box<Error>>
{
let mut cursor = Cursor::new(&bytes[..len]);
cursor.consume(offset);
pub fn new(doc_ids: DocIds) -> Negative {
Negative(doc_ids)
}

let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let doc_ids = DocIds::from_shared_bytes(bytes, offset, len)?;

Ok((Negative { doc_ids }, offset + len))
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
let doc_ids = DocIds::from_cursor(cursor)?;
Ok(Negative(doc_ids))
}

pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.doc_ids.as_bytes();
let slice = self.0.as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
}

pub fn is_empty(&self) -> bool {
self.doc_ids.doc_ids().is_empty()
self.0.is_empty()
}
}

@@ -48,6 +38,6 @@ impl Deref for Negative {
type Target = Set<DocumentId>;

fn deref(&self) -> &Self::Target {
self.doc_ids.doc_ids()
self.0.as_ref()
}
}
@@ -1,7 +1,5 @@
use std::io::{Write, BufRead, Cursor};
use std::mem::size_of;
use std::error::Error;
use std::sync::Arc;

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, Streamer, IntoStreamer};

@@ -10,51 +8,51 @@ use sdset::duo::Union;
use fst::raw::Fst;

use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::data::SharedData;
use crate::DocIndex;

#[derive(Default)]
pub struct Positive {
pub map: Map,
pub indexes: DocIndexes,
map: Map,
indexes: DocIndexes,
}

impl Positive {
pub fn from_shared_bytes(
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
) -> Result<(Positive, usize), Box<Error>>
{
let mut cursor = Cursor::new(&bytes[..len]);
cursor.consume(offset);
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
Positive { map, indexes }
}

let map_len = cursor.read_u64::<LittleEndian>()? as usize;
let map_offset = cursor.position() as usize;
let fst = Fst::from_shared_bytes(bytes.clone(), map_offset, map_len)?;
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let data = cursor.get_ref().range(offset, len);

let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
cursor.consume(len);

cursor.consume(map_len);
let indexes_len = cursor.read_u64::<LittleEndian>()? as usize;
let indexes_offset = cursor.position() as usize;
let indexes = DocIndexes::from_shared_bytes(bytes, indexes_offset, indexes_len)?;
let indexes = DocIndexes::from_cursor(cursor)?;

let positive = Positive { map, indexes };
let len = indexes_offset + indexes_len;

Ok((positive, len))
Ok(Positive { map, indexes})
}

pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
// indexes
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);

// map
self.indexes.write_to_bytes(bytes);
}

pub fn map(&self) -> &Map {
&self.map
}

pub fn indexes(&self) -> &DocIndexes {
&self.indexes
}

pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
let mut builder = PositiveBuilder::memory();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();

@@ -155,15 +153,11 @@ impl<W: Write, X: Write> PositiveBuilder<W, X> {
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}

pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}

pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
@@ -141,10 +141,12 @@ impl Schema {
attributes
}

pub fn document_id<T>(&self, document: &T) -> Result<DocumentId, Box<Error>>
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
unimplemented!()
let id_attribute_name = &self.inner.identifier;
let serializer = FindDocumentIdSerializer { id_attribute_name };
document.serialize(serializer)
}

pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
@@ -1,4 +1,4 @@
use crate::database::update::UnorderedPositiveBlobBuilder;
use crate::database::update::RawUpdateBuilder;
use crate::database::schema::SchemaAttr;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;

@@ -10,7 +10,7 @@ use serde::ser;

pub struct IndexerSerializer<'a, B> {
pub tokenizer_builder: &'a B,
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
pub builder: &'a mut RawUpdateBuilder,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
}

@@ -72,10 +72,10 @@ where B: TokenizerBuilder
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
self.builder.insert(word_unidecoded, doc_index);
self.builder.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
}

self.builder.insert(word_lower, doc_index);
self.builder.insert_doc_index(word_lower.into_bytes(), doc_index);
}
Ok(())
}
@@ -1,24 +1,20 @@
use std::collections::BTreeMap;

use serde::Serialize;
use serde::ser;

use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::update::UnorderedPositiveBlobBuilder;
use crate::database::document_key::DocumentKeyAttr;
use crate::database::update::NewState;
use crate::database::Schema;
use crate::database::update::RawUpdateBuilder;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;

pub struct Serializer<'a, B> {
pub schema: &'a Schema,
pub tokenizer_builder: &'a B,
pub document_id: DocumentId,
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
pub tokenizer_builder: &'a B,
pub builder: &'a mut RawUpdateBuilder,
}

impl<'a, B> ser::Serializer for Serializer<'a, B>

@@ -145,7 +141,6 @@ where B: TokenizerBuilder
document_id: self.document_id,
current_key_name: None,
builder: self.builder,
new_states: self.new_states,
})
}

@@ -160,7 +155,6 @@ where B: TokenizerBuilder
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
builder: self.builder,
new_states: self.new_states,
})
}

@@ -181,8 +175,7 @@ pub struct MapSerializer<'a, B> {
pub tokenizer_builder: &'a B,
pub document_id: DocumentId,
pub current_key_name: Option<String>,
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
pub builder: &'a mut RawUpdateBuilder,
}

impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>

@@ -220,7 +213,7 @@ where B: TokenizerBuilder
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
let key = DocumentKeyAttr::new(self.document_id, attr);
self.new_states.insert(key, NewState::Updated { value });
self.builder.insert_attribute_value(key, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {

@@ -243,10 +236,9 @@ where B: TokenizerBuilder

pub struct StructSerializer<'a, B> {
pub schema: &'a Schema,
pub tokenizer_builder: &'a B,
pub document_id: DocumentId,
pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
pub tokenizer_builder: &'a B,
pub builder: &'a mut RawUpdateBuilder,
}

impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>

@@ -267,7 +259,7 @@ where B: TokenizerBuilder
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
let key = DocumentKeyAttr::new(self.document_id, attr);
self.new_states.insert(key, NewState::Updated { value });
self.builder.insert_attribute_value(key, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
@@ -1,95 +1,60 @@
use std::collections::{BTreeMap, BTreeSet};
use std::path::PathBuf;
use std::error::Error;

use fst::map::{Map, MapBuilder};
use rocksdb::rocksdb_options;
use serde::Serialize;
use sdset::Set;

use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
use crate::database::{DATA_INDEX, Schema, DocumentKeyAttr};
use crate::data::{DocIds, DocIndexes};
use crate::{DocumentId, DocIndex};
use super::Update;
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::Schema;

type Token = Vec<u8>; // TODO could be replaced by a SmallVec
type Value = Vec<u8>;
use crate::DocumentId;
use super::{Update, RawUpdateBuilder};

pub struct UpdateBuilder {
sst_file: PathBuf,
schema: Schema,
removed_documents: BTreeSet<DocumentId>,
words_indexes: BTreeMap<Token, Vec<DocIndex>>,
keys_values: BTreeMap<DocumentKeyAttr, Value>,
raw_builder: RawUpdateBuilder,
}

impl UpdateBuilder {
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
UpdateBuilder {
sst_file: path,
schema: schema,
removed_documents: BTreeSet::new(),
words_indexes: BTreeMap::new(),
keys_values: BTreeMap::new(),
raw_builder: RawUpdateBuilder::new(path),
}
}

pub fn update_document<T>(&mut self, document: T) -> Result<DocumentId, Box<Error>>
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
unimplemented!()
let document_id = self.schema.document_id(&document)?;

let serializer = Serializer {
schema: &self.schema,
document_id: document_id,
tokenizer_builder: tokenizer_builder,
builder: &mut self.raw_builder,
};

document.serialize(serializer)?;

Ok(document_id)
}

pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, Box<Error>>
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
unimplemented!()
let document_id = self.schema.document_id(&document)?;
self.raw_builder.remove_document(document_id);
Ok(document_id)
}

pub fn build(self) -> Result<Update, Box<Error>> {
let tree = {
let negative = {
let documents_ids = self.removed_documents.into_iter().collect();
let doc_ids = DocIds::from_raw(documents_ids);
Negative { doc_ids }
};

let positive = {
let mut builder = PositiveBuilder::memory();

for (key, mut indexes) in self.words_indexes {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
builder.insert(key, indexes);
}

let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive { map, indexes }
};

Index { negative, positive }
};

let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.sst_file.to_string_lossy())?;

// write the data-index
let mut bytes = Vec::new();
tree.write_to_bytes(&mut bytes);
file_writer.merge(DATA_INDEX, &bytes)?;

// write all the documents attributes updates
for (key, value) in self.keys_values {
file_writer.put(key.as_ref(), &value)?;
}

file_writer.finish()?;

Ok(Update { sst_file: self.sst_file })
self.raw_builder.build()
}
}
@@ -1,8 +1,10 @@
use std::path::{Path, PathBuf};

mod builder;
mod raw_builder;

pub use self::builder::UpdateBuilder;
pub use self::raw_builder::RawUpdateBuilder;

pub struct Update {
sst_file: PathBuf,
src/database/update/raw_builder.rs (new file, 93 lines)
@@ -0,0 +1,93 @@
use std::collections::{BTreeMap, BTreeSet};
use std::path::PathBuf;
use std::error::Error;

use rocksdb::rocksdb_options;
use fst::map::Map;
use sdset::Set;

use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
use crate::database::{DATA_INDEX, DocumentKeyAttr};
use crate::data::{DocIds, DocIndexes};
use crate::{DocumentId, DocIndex};
use super::Update;

type Token = Vec<u8>; // TODO could be replaced by a SmallVec
type Value = Vec<u8>;

pub struct RawUpdateBuilder {
sst_file: PathBuf,
removed_documents: BTreeSet<DocumentId>,
words_indexes: BTreeMap<Token, Vec<DocIndex>>,
keys_values: BTreeMap<DocumentKeyAttr, Value>,
}

impl RawUpdateBuilder {
pub fn new(path: PathBuf) -> RawUpdateBuilder {
RawUpdateBuilder {
sst_file: path,
removed_documents: BTreeSet::new(),
words_indexes: BTreeMap::new(),
keys_values: BTreeMap::new(),
}
}

pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
}

pub fn insert_attribute_value(&mut self, key_attr: DocumentKeyAttr, value: Vec<u8>) -> Option<Vec<u8>> {
self.keys_values.insert(key_attr, value)
}

pub fn remove_document(&mut self, id: DocumentId) {
self.removed_documents.insert(id);
}

pub fn build(self) -> Result<Update, Box<Error>> {
let tree = {
let negative = {
let documents_ids: Vec<_> = self.removed_documents.into_iter().collect();
let documents_ids = Set::new_unchecked(&documents_ids);
let doc_ids = DocIds::new(documents_ids);
Negative::new(doc_ids)
};

let positive = {
let mut builder = PositiveBuilder::memory();

for (key, mut indexes) in self.words_indexes {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
builder.insert(key, indexes)?;
}

let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};

Index { negative, positive }
};

let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.sst_file.to_string_lossy())?;

// write the data-index
let mut bytes = Vec::new();
tree.write_to_bytes(&mut bytes);
file_writer.merge(DATA_INDEX, &bytes)?;

// write all the documents attributes updates
for (key, value) in self.keys_values {
file_writer.put(key.as_ref(), &value)?;
}

file_writer.finish()?;

Ok(Update { sst_file: self.sst_file })
}
}