feat: Introduce the "data-index" entry with merge compaction

Clément Renault 2018-11-26 17:30:19 +01:00
parent 0e856db4e6
commit b636e5fe57
12 changed files with 251 additions and 231 deletions

@@ -35,6 +35,10 @@ impl DocIds {
Ok(DocIds { data })
}
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
}
pub fn contains(&self, doc: DocumentId) -> bool {
// FIXME prefer using the sdset::exponential_search function
self.doc_ids().binary_search(&doc).is_ok()
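
The FIXME above points at exponential (galloping) search. A minimal standalone sketch of that technique over a sorted slice, with illustrative names rather than the actual sdset API:

    // Double the probe index until it overshoots the target, then
    // binary-search the narrowed window. Returns Ok(position) or
    // Err(insertion point), like slice::binary_search.
    fn exponential_search<T: Ord>(slice: &[T], target: &T) -> Result<usize, usize> {
        let mut bound = 1;
        while bound < slice.len() && &slice[bound] < target {
            bound *= 2;
        }
        let start = bound / 2;
        let end = slice.len().min(bound + 1);
        slice[start..end]
            .binary_search(target)
            .map(|i| i + start)
            .map_err(|i| i + start)
    }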

@@ -19,7 +19,7 @@ struct Range {
end: u64,
}
#[derive(Clone)]
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: Data,
indexes: Data,
@@ -29,15 +29,14 @@ impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
let range_len = range_len as usize * mem::size_of::<Range>();
let ranges_len_offset = mmap.as_slice().len() - mem::size_of::<u64>();
let ranges_len = (&mmap.as_slice()[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize;
let ranges = Data::Mmap(mmap.range(offset, range_len));
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = Data::Mmap(mmap.range(ranges_offset, ranges_len));
let len = mmap.len() - range_len - offset;
let offset = offset + range_len;
let indexes = Data::Mmap(mmap.range(offset, len));
let indexes = Data::Mmap(mmap.range(0, ranges_offset));
Ok(DocIndexes { ranges, indexes })
}
@@ -45,19 +44,22 @@ impl DocIndexes {
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let vec = Arc::new(vec);
let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
let range_len = range_len as usize * mem::size_of::<Range>();
let ranges_len_offset = vec.len() - mem::size_of::<u64>();
let ranges_len = (&vec[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize;
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = Data::Shared {
vec: vec.clone(),
offset,
len: range_len
offset: ranges_offset,
len: ranges_len,
};
let len = vec.len() - range_len - offset;
let offset = offset + range_len;
let indexes = Data::Shared { vec, offset, len };
let indexes = Data::Shared {
vec: vec,
offset: 0,
len: ranges_offset,
};
Ok(DocIndexes { ranges, indexes })
}
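
As far as this diff shows, both from_path and from_bytes now derive their offsets from the end of the buffer: the DocIndex bytes come first, the Range entries follow, and the final little-endian u64 is multiplied by mem::size_of::<Range>() to recover the ranges' byte length. A hypothetical helper mirroring that arithmetic on a plain byte slice (the Range field layout is assumed from this diff, not taken from the crate):

    use byteorder::{LittleEndian, ReadBytesExt};
    use std::{io, mem};

    #[repr(C)]
    struct Range { start: u64, end: u64 } // assumed layout

    // Split a serialized buffer into (index bytes, range bytes), following the
    // same offset computations as the new from_bytes.
    fn split_buffer(buf: &[u8]) -> io::Result<(&[u8], &[u8])> {
        let ranges_len_offset = buf.len() - mem::size_of::<u64>();
        let ranges_count = (&buf[ranges_len_offset..]).read_u64::<LittleEndian>()?;
        let ranges_len = ranges_count as usize * mem::size_of::<Range>();
        let ranges_offset = ranges_len_offset - ranges_len;
        Ok((&buf[..ranges_offset], &buf[ranges_offset..ranges_len_offset]))
    }
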
@@ -94,6 +96,53 @@ impl Serialize for DocIndexes {
}
}
pub struct RawDocIndexesBuilder<W> {
ranges: Vec<Range>,
wtr: W,
}
impl RawDocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
RawDocIndexesBuilder::new(Vec::new())
}
}
impl<W: Write> RawDocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
RawDocIndexesBuilder {
ranges: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &[DocIndex]) -> io::Result<()> {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.start).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
// write the values
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
Ok(self.wtr)
}
}
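
A hypothetical round-trip with the RawDocIndexesBuilder added above, assuming DocIndex and DocIndexes are in scope and that each insert call receives the postings of one word (slice names are illustrative):

    use std::io;

    fn build_and_reload(first: &[DocIndex], second: &[DocIndex]) -> io::Result<DocIndexes> {
        let mut builder = RawDocIndexesBuilder::memory();
        builder.insert(first)?;            // DocIndex values of the first word
        builder.insert(second)?;           // DocIndex values of the second word
        let bytes = builder.into_inner()?; // values, then ranges, then the ranges length
        DocIndexes::from_bytes(bytes)
    }
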
pub struct DocIndexesBuilder<W> {
keys: BTreeMap<String, u64>,
indexes: Vec<Vec<DocIndex>>,
@@ -136,29 +185,27 @@ impl<W: Write> DocIndexesBuilder<W> {
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(|_| ())
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
for vec in &mut self.indexes {
vec.sort_unstable();
}
let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
// write values first
let slice = unsafe { into_u8_slice(values.as_slice()) };
self.wtr.write_all(slice)?;
// write ranges after
let slice = unsafe { into_u8_slice(ranges.as_slice()) };
self.wtr.write_all(slice)?;
// write the length of the ranges
let len = ranges.len() as u64;
// TODO check if this is correct
self.wtr.write_u64::<LittleEndian>(len)?;
unsafe {
// write Ranges first
let slice = into_u8_slice(ranges.as_slice());
self.wtr.write_all(slice)?;
// write Values after
let slice = into_u8_slice(values.as_slice());
self.wtr.write_all(slice)?;
}
self.wtr.flush()?;
Ok(self.wtr)

@@ -7,7 +7,7 @@ use std::sync::Arc;
use fst::raw::MmapReadOnly;
pub use self::doc_ids::{DocIds, DocIdsBuilder};
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder, RawDocIndexesBuilder};
#[derive(Clone)]
enum Data {
@@ -19,6 +19,16 @@ enum Data {
Mmap(MmapReadOnly),
}
impl Default for Data {
fn default() -> Data {
Data::Shared {
vec: Arc::default(),
offset: 0,
len: 0,
}
}
}
impl Deref for Data {
type Target = [u8];
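
The Default impl on Data is what allows the #[derive(Clone, Default)] added on DocIndexes earlier in this commit to compile. A small illustration, assuming DocIndexes is in scope:

    // An empty index value that can stand in before any data is loaded;
    // both fields default to empty Data::Shared values.
    fn empty_index() -> DocIndexes {
        DocIndexes::default()
    }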