From 3a23dc242e4bd1fbcb210c0f8a079c0e3c30a2e9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 4 Jun 2020 16:17:24 +0200 Subject: [PATCH] More efficiently merge MTBLs, more than two at a time --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/bin/indexer.rs | 32 +++++++++++++++++--------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index be5640e22..69c627947 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -996,7 +996,7 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" [[package]] name = "oxidized-mtbl" version = "0.1.0" -source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=8918476#8918476f61f4430890d067db7b4a6cfb2d549c43" +source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=6acef3d#6acef3d0fc7fec6a3701038860e51f8bbcee1ee6" dependencies = [ "byteorder 1.3.4", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index f3b09dc56..ce09f5e06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" once_cell = "1.4.0" -oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" } +oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "6acef3d" } rayon = "1.3.0" roaring = "0.5.2" slice-group-by = "0.2.6" diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index bed058dfc..cd3ae66b1 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -100,36 +100,38 @@ impl MtblKvStore { Ok(MtblKvStore(Some(out))) } - fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option> { + fn merge(key: &[u8], values: &[Vec]) -> Option> { if key == b"\0words-fst" { - let left_fst = fst::Set::new(left).unwrap(); - let right_fst = fst::Set::new(right).unwrap(); + let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect(); // Union of the two FSTs - let op = fst::set::OpBuilder::new() - .add(left_fst.into_stream()) - .add(right_fst.into_stream()) - .r#union(); + let mut op = fst::set::OpBuilder::new(); + fsts.iter().for_each(|fst| op.push(fst.into_stream())); + let op = op.r#union(); let mut build = fst::SetBuilder::memory(); build.extend_stream(op.into_stream()).unwrap(); Some(build.into_inner().unwrap()) } else if key == b"\0headers" { - assert_eq!(left, right); - Some(left.to_vec()) + assert!(values.windows(2).all(|vs| vs[0] == vs[1])); + Some(values[0].to_vec()) } else if key.starts_with(&[1]) || key.starts_with(&[2]) { - let mut left = RoaringBitmap::deserialize_from(left).unwrap(); - let right = RoaringBitmap::deserialize_from(right).unwrap(); - left.union_with(&right); + let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap(); + + for value in &values[1..] { + let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap(); + first.union_with(&bitmap); + } + let mut vec = Vec::new(); - left.serialize_into(&mut vec).unwrap(); + first.serialize_into(&mut vec).unwrap(); Some(vec) } else if key.starts_with(&[3]) { - assert_eq!(left, right); - Some(left.to_vec()) + assert!(values.windows(2).all(|vs| vs[0] == vs[1])); + Some(values[0].to_vec()) } else { panic!("wut? {:?}", key)