From a20405f786387e8b31c606f5f371bfe1aecc7a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 5 May 2018 10:59:03 +0200 Subject: [PATCH] dump: Make the data less prone of memory indirections --- Cargo.lock | 38 ++++------ Cargo.toml | 1 - src/bin/raptor-cli.rs | 13 ++-- src/bin/raptor.rs | 8 +-- src/fst_map.rs | 161 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 153 ++++++--------------------------------- 6 files changed, 204 insertions(+), 170 deletions(-) create mode 100644 src/fst_map.rs diff --git a/Cargo.lock b/Cargo.lock index 89272ee81..fe70e63ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12,7 +12,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -274,7 +274,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "proc-macro2" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -285,7 +285,7 @@ name = "quote" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -318,10 +318,9 @@ dependencies = [ "fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)", - "smallvec 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", "tokio-minihttp 0.1.0 (git+https://github.com/tokio-rs/tokio-minihttp.git)", "tokio-proto 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "tokio-service 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -345,15 +344,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "serde" -version = "1.0.42" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "serde_derive" -version = "1.0.42" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", "quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)", "syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -364,7 +363,7 @@ name = "serde_derive_internals" version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", "syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -375,7 +374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -393,20 +392,12 @@ name = "smallvec" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "smallvec" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "syn" version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", "quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -689,21 +680,20 @@ dependencies = [ "checksum nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "9a2228dca57108069a5262f2ed8bd2e82496d2e074a06d1ccc7ce1687b6ae0a2" "checksum num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c51a3322e4bca9d212ad9a158a02abc6934d005490c054a2778df73a70aa0a30" "checksum percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831" -"checksum proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "49b6a521dc81b643e9a51e0d1cf05df46d5a2f3c0280ea72bcb68276ba64a118" +"checksum proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b16749538926f394755373f0dfec0852d79b3bd512a5906ceaeb72ee64a4eaa0" "checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8" "checksum rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "15a732abf9d20f0ad8eeb6f909bf6868722d9a06e1e50802b6a70351f40b4eb1" "checksum rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "eba5f8cb59cc50ed56be8880a5c7b496bfd9bd26394e176bc67884094145c2c5" "checksum redox_syscall 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "0d92eecebad22b767915e4d529f89f28ee96dbbf5a4810d2b844373f136417fd" "checksum scoped-tls 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8674d439c964889e2476f474a3bf198cc9e199e77499960893bac5de7e9218a4" "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" -"checksum serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)" = "a73973861352c932ed1365ce22b32467ce260ac4c8db11cf750ce56334ff2dcf" -"checksum serde_derive 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b392c5a0cebb98121454531c50e60e2ffe0fbeb1a44da277da2d681d08d7dc0b" +"checksum serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)" = "0c855d888276f20d140223bd06515e5bf1647fd6d02593cb5792466d9a8ec2d0" +"checksum serde_derive 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)" = "aa113e5fc4b008a626ba2bbd41330b56c9987d667f79f7b243e5a2d03d91ed1c" "checksum serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9d30c4596450fd7bbda79ef15559683f9a79ac0193ea819db90000d7e1cae794" "checksum serde_json 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)" = "8c6c4e049dc657a99e394bd85c22acbf97356feeec6dbf44150f2dcf79fb3118" "checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23" "checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d" "checksum smallvec 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c8cbcd6df1e117c2210e13ab5109635ad68a929fcbb8964dc965b76cb5ee013" -"checksum smallvec 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44db0ecb22921ef790d17ae13a3f6d15784183ff5f2a01aa32098c7498d2b4b9" "checksum syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)" = "91b52877572087400e83d24b9178488541e3d535259e04ff17a63df1e5ceff59" "checksum take 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b157868d8ac1f56b64604539990685fa7611d8fa9e5476cf0c02cf34d32917c5" "checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098" diff --git a/Cargo.toml b/Cargo.toml index b9ffc60b6..b70ef51ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,6 @@ lazy_static = "1.0" serde = "1.0" serde_derive = "1.0" serde_json = "1.0" -smallvec = { version = "0.6", features = ["serde"] } tokio-minihttp = { git = "https://github.com/tokio-rs/tokio-minihttp.git" } tokio-proto = "0.1" tokio-service = "0.1" diff --git a/src/bin/raptor-cli.rs b/src/bin/raptor-cli.rs index 6082243b3..430fe877c 100644 --- a/src/bin/raptor-cli.rs +++ b/src/bin/raptor-cli.rs @@ -13,7 +13,7 @@ use std::io::{BufReader, BufRead}; use fst::Streamer; use serde_json::from_str; -use raptor::{MultiMapBuilder, MultiMap}; +use raptor::{FstMapBuilder, FstMap}; #[derive(Debug, Deserialize)] struct Product { @@ -42,7 +42,7 @@ fn main() { set }; - let mut builder = MultiMapBuilder::new(); + let mut builder = FstMapBuilder::new(); for line in data.lines() { let line = line.unwrap(); @@ -65,11 +65,6 @@ fn main() { let values = File::create("values.vecs").unwrap(); let (map, values) = builder.build(map, values).unwrap(); - // just to check if the dump is valid - let map = unsafe { MultiMap::from_paths("map.fst", "values.vecs").unwrap() }; - - // let mut stream = map.stream(); - // while let Some(x) = stream.next() { - // println!("{:?}", x); - // } + eprintln!("Checking the dump consistency..."); + unsafe { FstMap::::from_paths("map.fst", "values.vecs").unwrap() }; } diff --git a/src/bin/raptor.rs b/src/bin/raptor.rs index 125bb90a7..86f7aff7a 100644 --- a/src/bin/raptor.rs +++ b/src/bin/raptor.rs @@ -21,19 +21,19 @@ use tokio_minihttp::{Request, Response, Http}; use tokio_proto::TcpServer; use tokio_service::Service; -use raptor::MultiMap; +use raptor::FstMap; lazy_static! { - static ref MAP: MultiMap = { + static ref MAP: FstMap = { let map = read_to_vec("map.fst").unwrap(); let values = read_to_vec("values.vecs").unwrap(); - MultiMap::from_bytes(map, &values).unwrap() + FstMap::from_bytes(map, &values).unwrap() }; } struct MainService { - map: &'static MultiMap, + map: &'static FstMap, } impl Service for MainService { diff --git a/src/fst_map.rs b/src/fst_map.rs new file mode 100644 index 000000000..3ffda0be8 --- /dev/null +++ b/src/fst_map.rs @@ -0,0 +1,161 @@ +use bincode; +use fst::{self, Map, MapBuilder, Automaton}; +use serde::de::DeserializeOwned; +use serde::ser::Serialize; +use std::fs::File; +use std::io::{Write, BufReader}; +use std::ops::{Range, Deref, DerefMut}; +use std::path::Path; +use {StreamBuilder, Stream}; + +#[derive(Debug)] +pub struct FstMap { + inner: Map, + values: Values, +} + +impl FstMap { + pub unsafe fn from_paths(map: P, values: Q) -> fst::Result + where + T: DeserializeOwned, + P: AsRef, + Q: AsRef + { + let inner = Map::from_path(map)?; + + // TODO handle errors !!! + let values = File::open(values).unwrap(); + let values = BufReader::new(values); + let values = bincode::deserialize_from(values).unwrap(); + + Ok(Self { inner, values }) + } + + pub fn from_bytes(map: Vec, values: &[u8]) -> fst::Result + where + T: DeserializeOwned + { + let inner = Map::from_bytes(map)?; + let values = bincode::deserialize(values).unwrap(); + + Ok(Self { inner, values }) + } + + pub fn stream(&self) -> Stream { + Stream { + inner: self.inner.stream(), + values: &self.values, + } + } + + pub fn contains_key>(&self, key: K) -> bool { + self.inner.contains_key(key) + } + + pub fn get>(&self, key: K) -> Option<&[T]> { + self.inner.get(key).map(|i| unsafe { self.values.get_unchecked(i as usize) }) + } + + pub fn search(&self, aut: A) -> StreamBuilder { + StreamBuilder { + inner: self.inner.search(aut), + values: &self.values, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct Values { + ranges: Box<[Range]>, + values: Box<[T]>, +} + +impl Values { + fn new(raw: Vec>) -> Self { + let cap = raw.len(); + let mut ranges = Vec::with_capacity(cap); + let cap = raw.iter().map(Vec::len).sum(); + let mut values = Vec::with_capacity(cap); + + for v in &raw { + let len = v.len() as u64; + let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); + + let range = Range { start, end: start + len }; + ranges.push(range); + } + + values.extend(raw.into_iter().flat_map(IntoIterator::into_iter)); + + let ranges = ranges.into_boxed_slice(); + let values = values.into_boxed_slice(); + + Self { ranges, values } + } + + pub unsafe fn get_unchecked(&self, index: usize) -> &[T] { + let range = self.ranges.get_unchecked(index); + let range = Range { start: range.start as usize, end: range.end as usize }; + self.values.get_unchecked(range) + } +} + +#[derive(Debug)] +pub struct FstMapBuilder { + map: Vec<(String, u64)>, + // This makes many memory indirections but it is only used + // at index time, not kept for query time. + values: Vec>, +} + +impl FstMapBuilder { + pub fn new() -> Self { + Self { + map: Vec::new(), + values: Vec::new(), + } + } + + pub fn insert>(&mut self, key: S, value: T) { + let key = key.into(); + match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) { + Ok(index) => { + let (_, index) = self.map[index]; + let values = &mut self.values[index as usize]; + + values.push(value); + }, + Err(index) => { + self.values.push(vec![value]); + let values_index = (self.values.len() - 1) as u64; + + let value = (key, values_index); + self.map.insert(index, value); + }, + } + } + + pub fn build_memory(self) -> fst::Result> { + Ok(FstMap { + inner: Map::from_iter(self.map)?, + values: Values::new(self.values), + }) + } + + pub fn build(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)> + where + T: Serialize, + W: Write, + X: Write + { + let mut builder = MapBuilder::new(map_wrt)?; + builder.extend_iter(self.map)?; + let map = builder.into_inner()?; + let values = Values::new(self.values); + + // TODO handle that error !!! + bincode::serialize_into(&mut values_wrt, &values).unwrap(); + + Ok((map, values_wrt)) + } +} diff --git a/src/lib.rs b/src/lib.rs index 78e2a8d80..80f34184c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,76 +1,26 @@ +#[macro_use] extern crate serde_derive; extern crate bincode; extern crate fst; -extern crate smallvec; +extern crate serde; -use std::ops::{Deref, DerefMut}; +mod fst_map; + +use std::ops::{Range, Deref, DerefMut}; use std::io::{Write, BufReader}; use std::fs::File; use std::path::Path; use std::str::from_utf8_unchecked; +use fst::Automaton; -pub use fst::MapBuilder; -use smallvec::SmallVec; +pub use self::fst_map::{FstMap, FstMapBuilder}; +use self::fst_map::Values; -type SmallVec32 = SmallVec<[T; 16]>; - -#[derive(Debug)] -pub struct MultiMap { - map: fst::Map, - values: Box<[SmallVec32]>, -} - -impl MultiMap { - pub unsafe fn from_paths(map: P, values: Q) -> fst::Result - where - P: AsRef, - Q: AsRef - { - let map = fst::Map::from_path(map)?; - - // TODO handle errors !!! - let values = File::open(values).unwrap(); - let values = BufReader::new(values); - let values = bincode::deserialize_from(values).unwrap(); - - Ok(MultiMap { map, values }) - } - - pub fn from_bytes(map: Vec, values: &[u8]) -> fst::Result { - let map = fst::Map::from_bytes(map)?; - let values = bincode::deserialize(values).unwrap(); - - Ok(MultiMap { map, values }) - } - - pub fn stream(&self) -> Stream { - Stream { - inner: self.map.stream(), - values: &self.values, - } - } - - pub fn contains_key>(&self, key: K) -> bool { - self.map.contains_key(key) - } - - pub fn get>(&self, key: K) -> Option<&[u64]> { - self.map.get(key).map(|i| &*self.values[i as usize]) - } - - pub fn search(&self, aut: A) -> StreamBuilder { - StreamBuilder { - inner: self.map.search(aut), - values: &self.values, - } - } -} - -pub struct StreamBuilder<'a, A: fst::Automaton> { +pub struct StreamBuilder<'a, T: 'a, A: Automaton> { inner: fst::map::StreamBuilder<'a, A>, - values: &'a [SmallVec32], + values: &'a Values, } -impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> { +impl<'a, T, A: Automaton> Deref for StreamBuilder<'a, T, A> { type Target = fst::map::StreamBuilder<'a, A>; fn deref(&self) -> &Self::Target { @@ -78,16 +28,16 @@ impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> { } } -impl<'a, A: fst::Automaton> DerefMut for StreamBuilder<'a, A> { +impl<'a, T, A: Automaton> DerefMut for StreamBuilder<'a, T, A> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> { - type Item = (&'a str, &'a [u64]); +impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> { + type Item = (&'a str, &'a [T]); - type Into = Stream<'a, A>; + type Into = Stream<'a, T, A>; fn into_stream(self) -> Self::Into { Stream { @@ -97,84 +47,23 @@ impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> { } } -pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> { +pub struct Stream<'a, T: 'a, A: Automaton = fst::automaton::AlwaysMatch> { inner: fst::map::Stream<'a, A>, - values: &'a [SmallVec32], + values: &'a Values, } -impl<'a, 'm, A: fst::Automaton> fst::Streamer<'a> for Stream<'m, A> { - type Item = (&'a str, &'a [u64]); +impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> { + type Item = (&'a str, &'a [T]); fn next(&'a mut self) -> Option { // Here we can't just `map` because of some borrow rules match self.inner.next() { Some((key, i)) => { let key = unsafe { from_utf8_unchecked(key) }; - Some((key, &*self.values[i as usize])) + let values = unsafe { self.values.get_unchecked(i as usize) }; + Some((key, values)) }, None => None, } } } - -#[derive(Debug)] -pub struct MultiMapBuilder { - map: Vec<(String, u64)>, - values: Vec>, -} - -impl<'a> MultiMapBuilder { - pub fn new() -> MultiMapBuilder { - MultiMapBuilder { - map: Vec::new(), - values: Vec::new(), - } - } - - pub fn insert>(&mut self, key: S, value: u64) { - let key = key.into(); - match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) { - Ok(index) => { - let (_, index) = self.map[index]; - let values = &mut self.values[index as usize]; - if let Err(index) = values.binary_search(&value) { - values.insert(index, value) - } - }, - Err(index) => { - let values = { - let mut vec = SmallVec32::new(); - vec.push(value); - vec - }; - self.values.push(values); - let values_index = (self.values.len() - 1) as u64; - - let value = (key, values_index); - self.map.insert(index, value); - }, - } - } - - pub fn build_memory(self) -> fst::Result { - Ok(MultiMap { - map: fst::Map::from_iter(self.map)?, - values: self.values.into_boxed_slice(), - }) - } - - pub fn build(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)> - where - W: Write, - X: Write - { - let mut builder = MapBuilder::new(map_wrt)?; - builder.extend_iter(self.map)?; - let map = builder.into_inner()?; - - // TODO handle that !!! - bincode::serialize_into(&mut values_wrt, &self.values).unwrap(); - - Ok((map, values_wrt)) - } -}