chore: Rework the data module structures

being able to be constructed from SharedData
This commit is contained in:
Clément Renault 2018-12-31 18:33:59 +01:00
parent c022fa3fca
commit 64d53ee1bd
No known key found for this signature in database
GPG key ID: 0151CDAB43460DAE
15 changed files with 292 additions and 262 deletions

View file

@ -1,62 +1,53 @@
use std::io::{self, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::sync::Arc;
use std::{io, mem};
use std::mem::size_of;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::DocumentId;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Default, Clone)]
pub struct DocIds {
data: SharedData,
}
pub struct DocIds(SharedData);
impl DocIds {
pub fn empty() -> Self {
DocIds { data: SharedData::empty() }
pub fn new(ids: &Set<DocumentId>) -> DocIds {
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = SharedData::from_bytes(bytes.to_vec());
DocIds(data)
}
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let len = vec.len();
DocIds::from_shared_bytes(Arc::new(vec), 0, len)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let doc_ids = cursor.get_ref().range(offset, len);
cursor.consume(len);
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
let data = SharedData { bytes, offset, len };
DocIds::from_data(data)
}
pub fn as_bytes(&self) -> &[u8] {
&self.data
}
fn from_data(data: SharedData) -> io::Result<Self> {
let len = data.as_ref().read_u64::<LittleEndian>()?;
let data = data.range(mem::size_of::<u64>(), len as usize);
Ok(DocIds { data })
}
pub fn from_raw(vec: Vec<DocumentId>) -> Self {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
Ok(DocIds(doc_ids))
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let len = self.data.len() as u64;
let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.data);
bytes.extend_from_slice(&self.0);
}
pub fn contains(&self, doc: DocumentId) -> bool {
// FIXME prefer using the sdset::exponential_search function
self.doc_ids().binary_search(&doc).is_ok()
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn doc_ids(&self) -> &Set<DocumentId> {
let slice = &self.data;
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / mem::size_of::<DocumentId>();
let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}

View file

@ -1,5 +1,5 @@
use std::io::{self, Write, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::mem::size_of;
use std::ops::Index;
use std::sync::Arc;
@ -9,6 +9,7 @@ use sdset::Set;
use crate::DocIndex;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Debug)]
#[repr(C)]
@ -24,40 +25,36 @@ pub struct DocIndexes {
}
impl DocIndexes {
pub fn from_bytes(vec: Vec<u8>) -> io::Result<DocIndexes> {
let len = vec.len();
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
let bytes = Arc::new(bytes);
let len = bytes.len();
let data = SharedData::new(bytes, 0, len);
let mut cursor = Cursor::new(data);
DocIndexes::from_cursor(&mut cursor)
}
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<DocIndexes> {
let data = SharedData { bytes, offset, len };
DocIndexes::from_data(data)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let ranges = cursor.get_ref().range(offset, len);
cursor.consume(len);
fn from_data(data: SharedData) -> io::Result<DocIndexes> {
let ranges_len_offset = data.len() - size_of::<u64>();
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize;
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = data.range(ranges_offset, ranges_len);
let indexes = data.range(0, ranges_offset);
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let indexes = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIndexes { ranges, indexes })
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let ranges_len = self.ranges.len() as u64;
let indexes_len = self.indexes.len() as u64;
let u64_size = size_of::<u64>() as u64;
let len = indexes_len + ranges_len + u64_size;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(&self.indexes);
bytes.extend_from_slice(&self.ranges);
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.ranges);
let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
}
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
@ -97,12 +94,17 @@ impl Index<usize> for DocIndexes {
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder::new(Vec::new())
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
}
}
@ -110,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
// write the values
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
self.indexes.extend_from_slice(indexes);
}
pub fn finish(self) -> io::Result<()> {
@ -130,24 +131,20 @@ impl<W: Write> DocIndexesBuilder<W> {
}
pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let ranges = unsafe { into_u8_slice(&self.ranges) };
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr)
}
}
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}
#[cfg(test)]
mod tests {
use super::*;
@ -177,9 +174,9 @@ mod tests {
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
@ -212,18 +209,17 @@ mod tests {
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
let len = size_of::<u64>();
assert_eq!(builder_bytes, &bytes[len..]);
assert_eq!(builder_bytes, bytes);
Ok(())
}

View file

@ -1,26 +1,30 @@
mod doc_ids;
mod doc_indexes;
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Deref;
use std::sync::Arc;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
#[derive(Clone)]
struct SharedData {
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
#[derive(Default, Clone)]
pub struct SharedData {
pub bytes: Arc<Vec<u8>>,
pub offset: usize,
pub len: usize,
}
impl SharedData {
pub fn empty() -> SharedData {
SharedData {
bytes: Arc::default(),
offset: 0,
len: 0,
}
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
let len = vec.len();
let bytes = Arc::new(vec);
SharedData::new(bytes, 0, len)
}
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
@ -33,12 +37,6 @@ impl SharedData {
}
}
impl Default for SharedData {
fn default() -> SharedData {
SharedData::empty()
}
}
impl Deref for SharedData {
type Target = [u8];
@ -52,3 +50,9 @@ impl AsRef<[u8]> for SharedData {
&self.bytes[self.offset..self.offset + self.len]
}
}
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}