use std::sync::Arc; use std::ops::Deref; use std::error::Error; use std::path::Path; use std::collections::btree_map::{Entry, BTreeMap}; use std::slice::from_raw_parts; use std::io::{self, Write}; use std::mem; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use fst::{self, Map, MapBuilder, Automaton}; use fst::raw::MmapReadOnly; use crate::DocIndex; #[repr(C)] struct Range { start: u64, end: u64, } #[derive(Clone)] enum DocIndexesData { Shared { vec: Arc>, offset: usize, len: usize, }, Mmap(MmapReadOnly), } impl Deref for DocIndexesData { type Target = [u8]; fn deref(&self) -> &Self::Target { match self { DocIndexesData::Shared { vec, offset, len } => { &vec[*offset..offset + len] }, DocIndexesData::Mmap(m) => m.as_slice(), } } } #[derive(Clone)] pub struct DocIndexes { ranges: DocIndexesData, indexes: DocIndexesData, } impl DocIndexes { pub unsafe fn from_path>(path: P) -> io::Result { let mmap = MmapReadOnly::open_path(path)?; let range_len = mmap.as_slice().read_u64::()?; let range_len = range_len as usize * mem::size_of::(); let offset = mem::size_of::() as usize; let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len)); let len = mmap.len() - range_len - offset; let offset = offset + range_len; let indexes = DocIndexesData::Mmap(mmap.range(offset, len)); Ok(DocIndexes { ranges, indexes }) } pub fn from_bytes(vec: Vec) -> io::Result { let vec = Arc::new(vec); let range_len = vec.as_slice().read_u64::()?; let range_len = range_len as usize * mem::size_of::(); let offset = mem::size_of::() as usize; let ranges = DocIndexesData::Shared { vec: vec.clone(), offset, len: range_len }; let len = vec.len() - range_len - offset; let offset = offset + range_len; let indexes = DocIndexesData::Shared { vec, offset, len }; Ok(DocIndexes { ranges, indexes }) } pub fn get(&self, index: u64) -> Option<&[DocIndex]> { self.ranges().get(index as usize).map(|Range { start, end }| { let start = *start as usize; let end = *end as usize; &self.indexes()[start..end] }) } fn ranges(&self) -> &[Range] { let slice = &self.ranges; let ptr = slice.as_ptr() as *const Range; let len = slice.len() / mem::size_of::(); unsafe { from_raw_parts(ptr, len) } } fn indexes(&self) -> &[DocIndex] { let slice = &self.indexes; let ptr = slice.as_ptr() as *const DocIndex; let len = slice.len() / mem::size_of::(); unsafe { from_raw_parts(ptr, len) } } } pub struct Metadata { map: Map, indexes: DocIndexes, } impl Metadata { pub unsafe fn from_paths(map: P, indexes: Q) -> Result> where P: AsRef, Q: AsRef, { let map = Map::from_path(map)?; let indexes = DocIndexes::from_path(indexes)?; Ok(Metadata::from_raw(map, indexes)) } pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { let map = Map::from_bytes(map)?; let indexes = DocIndexes::from_bytes(indexes)?; Ok(Metadata::from_raw(map, indexes)) } pub fn from_raw(map: Map, indexes: DocIndexes) -> Self { Metadata { map, indexes } } pub fn get>(&self, key: K) -> Option<&[DocIndex]> { self.map.get(key).and_then(|index| self.indexes.get(index)) } pub fn as_map(&self) -> &Map { &self.map } pub fn as_indexes(&self) -> &DocIndexes { &self.indexes } pub fn explode(self) -> (Map, DocIndexes) { (self.map, self.indexes) } } pub struct Inner { keys: BTreeMap, indexes: Vec>, number_docs: usize, } impl Inner { pub fn new() -> Self { Inner { keys: BTreeMap::new(), indexes: Vec::new(), number_docs: 0, } } pub fn number_doc_indexes(&self) -> usize { self.number_docs } pub fn insert(&mut self, key: String, value: DocIndex) { match self.keys.entry(key) { Entry::Vacant(e) => { let index = self.indexes.len() as u64; self.indexes.push(vec![value]); e.insert(index); }, Entry::Occupied(e) => { let index = *e.get(); let vec = &mut self.indexes[index as usize]; vec.push(value); }, } self.number_docs += 1; } } pub struct MetadataBuilder { inner: Inner, map: W, indexes: X, } impl MetadataBuilder { pub fn new(map: W, indexes: X) -> Self { Self { inner: Inner::new(), map, indexes } } pub fn insert(&mut self, key: String, index: DocIndex) { self.inner.insert(key, index) } pub fn finish(self) -> Result<(), Box> { self.into_inner().map(|_| ()) } pub fn into_inner(mut self) -> Result<(W, X), Box> { let number_docs = self.inner.number_doc_indexes(); let mut keys_builder = MapBuilder::new(self.map)?; keys_builder.extend_iter(self.inner.keys)?; let map = keys_builder.into_inner()?; // write down doc_indexes into the indexes Writer let (ranges, values) = into_sliced_ranges(self.inner.indexes, number_docs); let len = ranges.len() as u64; // TODO check if this is correct self.indexes.write_u64::(len)?; unsafe { // write Ranges first let slice = into_u8_slice(ranges.as_slice()); self.indexes.write_all(slice)?; // write Values after let slice = into_u8_slice(values.as_slice()); self.indexes.write_all(slice)?; } self.indexes.flush()?; Ok((map, self.indexes)) } } fn into_sliced_ranges(vecs: Vec>, number_docs: usize) -> (Vec, Vec) { let cap = vecs.len(); let mut ranges = Vec::with_capacity(cap); let mut values = Vec::with_capacity(number_docs); for mut v in &vecs { let len = v.len() as u64; let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); let range = Range { start, end: start + len }; ranges.push(range); } values.extend(vecs.into_iter().flatten()); (ranges, values) } unsafe fn into_u8_slice(slice: &[T]) -> &[u8] { let ptr = slice.as_ptr() as *const u8; let len = slice.len() * mem::size_of::(); from_raw_parts(ptr, len) } pub struct OpBuilder<'m, 'v> { inner: fst::map::OpBuilder<'m>, indexes: &'v DocIndexes, } impl<'m, 'v> OpBuilder<'m, 'v> { pub fn new(indexes: &'v DocIndexes) -> Self { Self { inner: fst::map::OpBuilder::new(), indexes: indexes, } } pub fn add(mut self, streamable: I) -> Self where I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, { self.push(streamable); self } pub fn push(&mut self, streamable: I) where I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, { self.inner.push(streamable); } pub fn union(self) -> Union<'m, 'v> { Union { inner: self.inner.union(), outs: Vec::new(), indexes: self.indexes, } } } #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct IndexedValues<'a> { pub index: usize, pub values: &'a [DocIndex], } pub struct Union<'m, 'v> { inner: fst::map::Union<'m>, outs: Vec>, indexes: &'v DocIndexes, } impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> { type Item = (&'a [u8], &'a [IndexedValues<'a>]); fn next(&'a mut self) -> Option { match self.inner.next() { Some((s, ivalues)) => { self.outs.clear(); self.outs.reserve(ivalues.len()); for ivalue in ivalues { if let Some(values) = self.indexes.get(ivalue.value) { let index = ivalue.index; self.outs.push(IndexedValues { index, values }) } } Some((s, &self.outs)) }, None => None, } } } pub struct StreamBuilder<'m, 'v, A> { inner: fst::map::StreamBuilder<'m, A>, indexes: &'v DocIndexes, } impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A> where A: Automaton, A::State: Clone, { type Item = >::Item; type Into = Stream<'m, 'v, A>; fn into_stream(self) -> Self::Into { Stream { inner: self.inner.into_stream(), indexes: self.indexes, } } } pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> { inner: fst::map::Stream<'m, A>, indexes: &'v DocIndexes, } impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A> where A: Automaton, { type Item = (&'a [u8], &'a [DocIndex]); fn next(&'a mut self) -> Option { match self.inner.next() { Some((key, i)) => { match self.indexes.get(i) { Some(values) => Some((key, values)), None => None, } }, None => None, } } } #[cfg(test)] mod tests { use super::*; #[test] fn empty_serialize_deserialize() { let mapw = Vec::new(); let indexesw = Vec::new(); let builder = MetadataBuilder::new(mapw, indexesw); let (map, indexes) = builder.into_inner().unwrap(); let metas = Metadata::from_bytes(map, indexes).unwrap(); assert_eq!(metas.get("chameau"), None); } #[test] fn one_doc_serialize_deserialize() { let mapw = Vec::new(); let indexesw = Vec::new(); let mut builder = MetadataBuilder::new(mapw, indexesw); let doc = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; builder.insert("chameau".into(), doc); let (map, indexes) = builder.into_inner().unwrap(); let metas = Metadata::from_bytes(map, indexes).unwrap(); assert_eq!(metas.get("chameau"), Some(&[doc][..])); } #[test] fn multiple_docs_serialize_deserialize() { let mapw = Vec::new(); let indexesw = Vec::new(); let mut builder = MetadataBuilder::new(mapw, indexesw); let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 }; let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 }; builder.insert("chameau".into(), doc1); builder.insert("chameau".into(), doc2); let (map, indexes) = builder.into_inner().unwrap(); let metas = Metadata::from_bytes(map, indexes).unwrap(); assert_eq!(metas.get("chameau"), Some(&[doc1, doc2][..])); } }