dump: Make the data less prone to memory indirections

This commit is contained in:
Clément Renault 2018-05-05 10:59:03 +02:00
parent d0919b2108
commit a20405f786
6 changed files with 204 additions and 170 deletions

View file

@ -13,7 +13,7 @@ use std::io::{BufReader, BufRead};
use fst::Streamer;
use serde_json::from_str;
use raptor::{MultiMapBuilder, MultiMap};
use raptor::{FstMapBuilder, FstMap};
#[derive(Debug, Deserialize)]
struct Product {
@ -42,7 +42,7 @@ fn main() {
set
};
let mut builder = MultiMapBuilder::new();
let mut builder = FstMapBuilder::new();
for line in data.lines() {
let line = line.unwrap();
@ -65,11 +65,6 @@ fn main() {
let values = File::create("values.vecs").unwrap();
let (map, values) = builder.build(map, values).unwrap();
// just to check if the dump is valid
let map = unsafe { MultiMap::from_paths("map.fst", "values.vecs").unwrap() };
// let mut stream = map.stream();
// while let Some(x) = stream.next() {
// println!("{:?}", x);
// }
eprintln!("Checking the dump consistency...");
unsafe { FstMap::<u64>::from_paths("map.fst", "values.vecs").unwrap() };
}

View file

@ -21,19 +21,19 @@ use tokio_minihttp::{Request, Response, Http};
use tokio_proto::TcpServer;
use tokio_service::Service;
use raptor::MultiMap;
use raptor::FstMap;
lazy_static! {
static ref MAP: MultiMap = {
static ref MAP: FstMap<u64> = {
let map = read_to_vec("map.fst").unwrap();
let values = read_to_vec("values.vecs").unwrap();
MultiMap::from_bytes(map, &values).unwrap()
FstMap::from_bytes(map, &values).unwrap()
};
}
struct MainService {
map: &'static MultiMap,
map: &'static FstMap<u64>,
}
impl Service for MainService {

161
src/fst_map.rs Normal file
View file

@ -0,0 +1,161 @@
use bincode;
use fst::{self, Map, MapBuilder, Automaton};
use serde::de::DeserializeOwned;
use serde::ser::Serialize;
use std::fs::File;
use std::io::{Write, BufReader};
use std::ops::{Range, Deref, DerefMut};
use std::path::Path;
use {StreamBuilder, Stream};
/// A multi-value map backed by an fst: every key is associated with a slice of `T`.
#[derive(Debug)]
pub struct FstMap<T> {
// The fst itself; the `u64` stored for a key is used as an index into `values`.
inner: Map,
// Flattened storage holding the values of every key.
values: Values<T>,
}
impl<T> FstMap<T> {
pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<Self>
where
T: DeserializeOwned,
P: AsRef<Path>,
Q: AsRef<Path>
{
let inner = Map::from_path(map)?;
// TODO handle errors !!!
let values = File::open(values).unwrap();
let values = BufReader::new(values);
let values = bincode::deserialize_from(values).unwrap();
Ok(Self { inner, values })
}
pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<Self>
where
T: DeserializeOwned
{
let inner = Map::from_bytes(map)?;
let values = bincode::deserialize(values).unwrap();
Ok(Self { inner, values })
}
pub fn stream(&self) -> Stream<T> {
Stream {
inner: self.inner.stream(),
values: &self.values,
}
}
pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
self.inner.contains_key(key)
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[T]> {
self.inner.get(key).map(|i| unsafe { self.values.get_unchecked(i as usize) })
}
pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<T, A> {
StreamBuilder {
inner: self.inner.search(aut),
values: &self.values,
}
}
}
/// Flattened storage for many lists of `T`: one contiguous buffer plus the
/// range that each list occupies inside it.
#[derive(Debug, Serialize, Deserialize)]
pub struct Values<T> {
// `ranges[i]` delimits the i-th list inside `values`.
ranges: Box<[Range<u64>]>,
// Every list, stored back to back.
values: Box<[T]>,
}
impl<T> Values<T> {
    /// Flattens `raw` into a single buffer, recording for each inner list the
    /// range it occupies in that buffer (in the same order as `raw`).
    fn new(raw: Vec<Vec<T>>) -> Self {
        let total: usize = raw.iter().map(Vec::len).sum();

        // Lists are laid out back to back, so each range starts where the
        // previous one ended.
        let mut offset = 0u64;
        let ranges: Vec<Range<u64>> = raw
            .iter()
            .map(|list| {
                let start = offset;
                offset += list.len() as u64;
                Range { start, end: offset }
            })
            .collect();

        let mut values = Vec::with_capacity(total);
        for list in raw {
            values.extend(list);
        }

        Self {
            ranges: ranges.into_boxed_slice(),
            values: values.into_boxed_slice(),
        }
    }

    /// Returns the `index`-th list without any bounds checking.
    ///
    /// # Safety
    ///
    /// `index` must be lower than the number of stored ranges, and the range
    /// found there must lie inside the values buffer; otherwise the behavior
    /// is undefined.
    pub unsafe fn get_unchecked(&self, index: usize) -> &[T] {
        let &Range { start, end } = self.ranges.get_unchecked(index);
        self.values.get_unchecked(start as usize..end as usize)
    }
}
/// Accumulates `(key, value)` pairs in memory before producing an `FstMap`
/// or writing it out.
#[derive(Debug)]
pub struct FstMapBuilder<T> {
// Keys kept sorted (see `insert`), each paired with the index of its list in `values`.
map: Vec<(String, u64)>,
// One list of values per key. This involves many memory indirections,
// but it is only used at index time and is not kept for query time.
values: Vec<Vec<T>>,
}
impl<T> FstMapBuilder<T> {
pub fn new() -> Self {
Self {
map: Vec::new(),
values: Vec::new(),
}
}
pub fn insert<S: Into<String>>(&mut self, key: S, value: T) {
let key = key.into();
match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) {
Ok(index) => {
let (_, index) = self.map[index];
let values = &mut self.values[index as usize];
values.push(value);
},
Err(index) => {
self.values.push(vec![value]);
let values_index = (self.values.len() - 1) as u64;
let value = (key, values_index);
self.map.insert(index, value);
},
}
}
pub fn build_memory(self) -> fst::Result<FstMap<T>> {
Ok(FstMap {
inner: Map::from_iter(self.map)?,
values: Values::new(self.values),
})
}
pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)>
where
T: Serialize,
W: Write,
X: Write
{
let mut builder = MapBuilder::new(map_wrt)?;
builder.extend_iter(self.map)?;
let map = builder.into_inner()?;
let values = Values::new(self.values);
// TODO handle that error !!!
bincode::serialize_into(&mut values_wrt, &values).unwrap();
Ok((map, values_wrt))
}
}

View file

@ -1,76 +1,26 @@
#[macro_use] extern crate serde_derive;
extern crate bincode;
extern crate fst;
extern crate smallvec;
extern crate serde;
use std::ops::{Deref, DerefMut};
mod fst_map;
use std::ops::{Range, Deref, DerefMut};
use std::io::{Write, BufReader};
use std::fs::File;
use std::path::Path;
use std::str::from_utf8_unchecked;
use fst::Automaton;
pub use fst::MapBuilder;
use smallvec::SmallVec;
pub use self::fst_map::{FstMap, FstMapBuilder};
use self::fst_map::Values;
type SmallVec32<T> = SmallVec<[T; 16]>;
/// A map from keys to lists of `u64`, backed by an fst.
#[derive(Debug)]
pub struct MultiMap {
// The fst; the `u64` stored for a key is used as an index into `values`.
map: fst::Map,
// One list of values per key.
values: Box<[SmallVec32<u64>]>,
}
impl MultiMap {
/// Opens a `MultiMap` from an fst file (`map`) and a bincode-encoded values file (`values`).
///
/// # Safety
///
/// Forwards to `fst::Map::from_path` — presumably the reason this function
/// is `unsafe`; confirm against that function's documented contract.
///
/// # Panics
///
/// Panics if the values file cannot be opened or deserialized.
pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<MultiMap>
where
P: AsRef<Path>,
Q: AsRef<Path>
{
let map = fst::Map::from_path(map)?;
// TODO: return these errors instead of panicking.
let values = File::open(values).unwrap();
let values = BufReader::new(values);
let values = bincode::deserialize_from(values).unwrap();
Ok(MultiMap { map, values })
}
/// Builds a `MultiMap` from the raw bytes of an fst and the bincode-encoded values.
///
/// # Panics
///
/// Panics if `values` cannot be deserialized.
pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<MultiMap> {
let map = fst::Map::from_bytes(map)?;
let values = bincode::deserialize(values).unwrap();
Ok(MultiMap { map, values })
}
/// Returns a stream over every key of the map with its associated values.
pub fn stream(&self) -> Stream {
Stream {
inner: self.map.stream(),
values: &self.values,
}
}
/// Returns `true` if `key` is present in the underlying fst.
pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
self.map.contains_key(key)
}
/// Returns the values associated with `key`, if any.
///
/// # Panics
///
/// Panics if the index stored in the fst is out of bounds of `values`.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[u64]> {
self.map.get(key).map(|i| &*self.values[i as usize])
}
/// Returns a stream builder restricted to the keys matched by `aut`.
pub fn search<A: fst::Automaton>(&self, aut: A) -> StreamBuilder<A> {
StreamBuilder {
inner: self.map.search(aut),
values: &self.values,
}
}
}
pub struct StreamBuilder<'a, A: fst::Automaton> {
pub struct StreamBuilder<'a, T: 'a, A: Automaton> {
inner: fst::map::StreamBuilder<'a, A>,
values: &'a [SmallVec32<u64>],
values: &'a Values<T>,
}
impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> {
impl<'a, T, A: Automaton> Deref for StreamBuilder<'a, T, A> {
type Target = fst::map::StreamBuilder<'a, A>;
fn deref(&self) -> &Self::Target {
@ -78,16 +28,16 @@ impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> {
}
}
impl<'a, A: fst::Automaton> DerefMut for StreamBuilder<'a, A> {
impl<'a, T, A: Automaton> DerefMut for StreamBuilder<'a, T, A> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> {
type Item = (&'a str, &'a [u64]);
impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> {
type Item = (&'a str, &'a [T]);
type Into = Stream<'a, A>;
type Into = Stream<'a, T, A>;
fn into_stream(self) -> Self::Into {
Stream {
@ -97,84 +47,23 @@ impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> {
}
}
pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> {
pub struct Stream<'a, T: 'a, A: Automaton = fst::automaton::AlwaysMatch> {
inner: fst::map::Stream<'a, A>,
values: &'a [SmallVec32<u64>],
values: &'a Values<T>,
}
impl<'a, 'm, A: fst::Automaton> fst::Streamer<'a> for Stream<'m, A> {
type Item = (&'a str, &'a [u64]);
impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> {
type Item = (&'a str, &'a [T]);
fn next(&'a mut self) -> Option<Self::Item> {
// Here we can't just `map` because of some borrow rules
match self.inner.next() {
Some((key, i)) => {
let key = unsafe { from_utf8_unchecked(key) };
Some((key, &*self.values[i as usize]))
let values = unsafe { self.values.get_unchecked(i as usize) };
Some((key, values))
},
None => None,
}
}
}
/// Accumulates `(key, value)` pairs in memory before producing a `MultiMap`
/// or writing it out.
#[derive(Debug)]
pub struct MultiMapBuilder {
// Keys kept sorted (see `insert`), each paired with the index of its list in `values`.
map: Vec<(String, u64)>,
// One list of values per key.
values: Vec<SmallVec32<u64>>,
}
impl<'a> MultiMapBuilder {
/// Creates an empty builder.
pub fn new() -> MultiMapBuilder {
MultiMapBuilder {
map: Vec::new(),
values: Vec::new(),
}
}
/// Associates `value` with `key`.
///
/// The values of a key are kept sorted and free of duplicates; keys are
/// kept sorted as well.
pub fn insert<S: Into<String>>(&mut self, key: S, value: u64) {
let key = key.into();
match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) {
Ok(index) => {
// Known key: insert the value at its sorted position unless already present.
let (_, index) = self.map[index];
let values = &mut self.values[index as usize];
if let Err(index) = values.binary_search(&value) {
values.insert(index, value)
}
},
Err(index) => {
// New key: create its list and register the key at its sorted position.
let values = {
let mut vec = SmallVec32::new();
vec.push(value);
vec
};
self.values.push(values);
let values_index = (self.values.len() - 1) as u64;
let value = (key, values_index);
self.map.insert(index, value);
},
}
}
/// Builds the map entirely in memory.
pub fn build_memory(self) -> fst::Result<MultiMap> {
Ok(MultiMap {
map: fst::Map::from_iter(self.map)?,
values: self.values.into_boxed_slice(),
})
}
/// Writes the fst into `map_wrt` and the bincode-encoded values into
/// `values_wrt`, then hands both writers back.
///
/// # Panics
///
/// Panics if the values cannot be serialized into `values_wrt`.
pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)>
where
W: Write,
X: Write
{
let mut builder = MapBuilder::new(map_wrt)?;
builder.extend_iter(self.map)?;
let map = builder.into_inner()?;
// TODO: return the serialization error instead of panicking.
bincode::serialize_into(&mut values_wrt, &self.values).unwrap();
Ok((map, values_wrt))
}
}