Merge many MTBL into one at the same time

This commit is contained in:
Kerollmops 2020-06-01 18:39:58 +02:00
parent 6a047519f6
commit 5cc81a0179
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -100,22 +100,6 @@ impl MtblKvStore {
Ok(MtblKvStore(Some(out))) Ok(MtblKvStore(Some(out)))
} }
fn merge_with(self, other: MtblKvStore) -> anyhow::Result<MtblKvStore> {
eprintln!("{:?}: Merging two MTBL stores...", rayon::current_thread_index());
let (left, right) = match (self.0, other.0) {
(Some(left), Some(right)) => (left, right),
(Some(left), None) => return Ok(MtblKvStore(Some(left))),
(None, Some(right)) => return Ok(MtblKvStore(Some(right))),
(None, None) => return Ok(MtblKvStore(None)),
};
let left = unsafe { memmap::Mmap::map(&left)? };
let right = unsafe { memmap::Mmap::map(&right)? };
let left = Reader::new(&left, ReaderOptions::default()).unwrap();
let right = Reader::new(&right, ReaderOptions::default()).unwrap();
fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option<Vec<u8>> { fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option<Vec<u8>> {
if key == b"\0words-fst" { if key == b"\0words-fst" {
let left_fst = fst::Set::new(left).unwrap(); let left_fst = fst::Set::new(left).unwrap();
@ -152,11 +136,21 @@ impl MtblKvStore {
} }
} }
fn from_many(stores: Vec<MtblKvStore>) -> anyhow::Result<MtblKvStore> {
eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len());
let mmaps: Vec<_> = stores.iter().flat_map(|m| {
m.0.as_ref().map(|f| unsafe { memmap::Mmap::map(f).unwrap() })
}).collect();
let sources = mmaps.iter().map(|mmap| {
Reader::new(&mmap, ReaderOptions::default()).unwrap()
}).collect();
let outfile = tempfile::tempfile()?; let outfile = tempfile::tempfile()?;
let mut out = Writer::new(outfile, None)?; let mut out = Writer::new(outfile, None)?;
let sources = vec![left, right]; let opt = MergerOptions { merge: MtblKvStore::merge };
let opt = MergerOptions { merge };
let mut merger = Merger::new(sources, opt); let mut merger = Merger::new(sources, opt);
let mut iter = merger.iter(); let mut iter = merger.iter();
@ -286,30 +280,18 @@ fn main() -> anyhow::Result<()> {
let index = Index::new(&env)?; let index = Index::new(&env)?;
let mut stores: Vec<_> = opt.files_to_index let stores: Vec<_> = opt.files_to_index
.into_par_iter() .into_par_iter()
.try_fold(MtblKvStore::default, |acc, path| { .map(|path| {
let rdr = csv::Reader::from_path(path)?; let rdr = csv::Reader::from_path(path)?;
let store = index_csv(rdr)?; index_csv(rdr)
acc.merge_with(store)
}) })
.inspect(|_| { .inspect(|_| {
eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed))
}) })
.collect::<Result<_, _>>()?; .collect::<Result<_, _>>()?;
while stores.len() >= 1 { let mtbl_store = MtblKvStore::from_many(stores)?;
let s = std::mem::take(&mut stores);
stores = s.into_par_iter().chunks(2).map(|mut v| {
match (v.pop(), v.pop()) {
(Some(a), Some(b)) => a.merge_with(b),
(Some(a), _) => Ok(a),
_ => unreachable!(),
}
}).collect::<Result<_, _>>()?;
}
let mtbl_store = stores.pop().unwrap_or_default();
eprintln!("We are writing into LMDB..."); eprintln!("We are writing into LMDB...");
let mut wtxn = env.write_txn()?; let mut wtxn = env.write_txn()?;