From 55a8941922bbdacb78a71ab2ed37020c7f9e7e55 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Jun 2020 18:37:57 +0200 Subject: [PATCH] Optimize things --- Cargo.lock | 31 ++++++++----------- Cargo.toml | 5 ++- benches/search.rs | 25 +++++++++++++++ src/lib.rs | 78 ++++++++++++++++++++++++++--------------------- 4 files changed, 85 insertions(+), 54 deletions(-) create mode 100644 benches/search.rs diff --git a/Cargo.lock b/Cargo.lock index 6129fd497..9c34e3a1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,7 +48,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5753e2a71534719bf3f4e57006c3a4f0d2c672a4b676eec84161f763eca87dbf" dependencies = [ - "byteorder 1.3.4", + "byteorder", "serde", ] @@ -75,7 +75,7 @@ checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" dependencies = [ "block-padding", "byte-tools", - "byteorder 1.3.4", + "byteorder", "generic-array", ] @@ -116,12 +116,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" -[[package]] -name = "byteorder" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" - [[package]] name = "byteorder" version = "1.3.4" @@ -445,7 +439,7 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" dependencies = [ - "byteorder 1.3.4", + "byteorder", ] [[package]] @@ -533,7 +527,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd7882b766b4be1b90d8ce5ce4c7aca2539b43176a708dbc8e79576dbbdbba93" dependencies = [ - "byteorder 1.3.4", + "byteorder", "heed-traits", "heed-types", "libc", @@ -784,7 +778,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bitpacking", - "byteorder 
1.3.4", + "byteorder", "cow-utils", "csv", "fst", @@ -1000,7 +994,7 @@ name = "oxidized-mtbl" version = "0.1.0" source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=9451be8#9451be8829562f7d1f8d34aa3ecb81c5106a0623" dependencies = [ - "byteorder 1.3.4", + "byteorder", "crc32c", "flate2", "snap", @@ -1360,7 +1354,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" dependencies = [ - "byteorder 1.3.4", + "byteorder", ] [[package]] @@ -1374,11 +1368,10 @@ dependencies = [ [[package]] name = "roaring" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4af20e5d3e44732a57489fa297768ca29361b54fbc3b20cdeb738fa6932cc22d" +version = "0.6.0" +source = "git+https://github.com/Kerollmops/roaring-rs.git?branch=deserialize-from-slice#24420bb9f980749476cec860ea8dd3c1683c0cd1" dependencies = [ - "byteorder 0.5.3", + "byteorder", ] [[package]] @@ -1693,7 +1686,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfea31758bf674f990918962e8e5f07071a3161bd7c4138ed23e416e1ac4264e" dependencies = [ "base64 0.11.0", - "byteorder 1.3.4", + "byteorder", "bytes", "http", "httparse", @@ -1901,7 +1894,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" dependencies = [ - "byteorder 1.3.4", + "byteorder", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 168a9fb09..463994d37 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ memmap = "0.7.0" once_cell = "1.4.0" oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "9451be8" } rayon = "1.3.0" -roaring = "0.5.2" +roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" } slice-group-by = "0.2.6" smallstr = "0.2.0" smallvec = "1.4.0" @@ -40,3 
+40,6 @@ warp = "0.2.2" [profile.release] debug = true + +[profile.bench] +debug = true diff --git a/benches/search.rs b/benches/search.rs new file mode 100644 index 000000000..3c2d9645f --- /dev/null +++ b/benches/search.rs @@ -0,0 +1,25 @@ +#![feature(test)] +extern crate test; + +use heed::EnvOpenOptions; +use mega_mini_indexer::Index; + +#[bench] +fn search_minogue_kylie_live(b: &mut test::Bencher) { + let database = "books-4cpu.mmdb"; + let query = "minogue kylie live"; + + std::fs::create_dir_all(database).unwrap(); + let env = EnvOpenOptions::new() + .map_size(100 * 1024 * 1024 * 1024) // 100 GB + .max_readers(10) + .max_dbs(5) + .open(database).unwrap(); + + let index = Index::new(&env).unwrap(); + + b.iter(|| { + let rtxn = env.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn, query).unwrap(); + }) +} diff --git a/src/lib.rs b/src/lib.rs index f64082b4d..621feee09 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -104,7 +104,7 @@ impl Index { while let Some(word) = stream.next() { let word = std::str::from_utf8(word)?; if let Some(attrs) = self.postings_attrs.get(rtxn, word)? 
{ - let right = RoaringBitmap::deserialize_from(attrs)?; + let right = RoaringBitmap::deserialize_from_slice(attrs)?; union_positions.union_with(&right); derived_words.push((word.as_bytes().to_vec(), right)); count += 1; @@ -120,57 +120,67 @@ impl Index { let mut documents = Vec::new(); - let mut debug_intersects = HashMap::new(); + // let mut debug_intersects = HashMap::new(); let mut intersect_cache = HashMap::new(); + let mut lunion_docids = RoaringBitmap::default(); + let mut runion_docids = RoaringBitmap::default(); let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| { let proximity = best_proximity::positions_proximity(lpos, rpos); *intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| { - use std::iter::once; - - let (nb_words, nb_docs_intersect, lnblookups, lnbbitmaps, rnblookups, rnbbitmaps) = - debug_intersects.entry((lword, lpos, rword, rpos, proximity)).or_default(); + // let (nb_words, nb_docs_intersect, lnblookups, lnbbitmaps, rnblookups, rnbbitmaps) = + // debug_intersects.entry((lword, lpos, rword, rpos, proximity)).or_default(); let left = (&words[lword], lpos); let right = (&words[rword], rpos); - *nb_words = left.0.len() + right.0.len(); + // *nb_words = left.0.len() + right.0.len(); let mut l_lookups = 0; let mut l_bitmaps = 0; let mut r_lookups = 0; let mut r_bitmaps = 0; - let mut intersect_docids: Option<RoaringBitmap> = None; - for (i, (derived_words, pos))in once(left).chain(once(right)).enumerate() { - let mut union_docids = RoaringBitmap::default(); - // TODO re-enable the prefixes system - for (word, attrs) in derived_words.iter() { - if attrs.contains(pos) { - if i == 0 { l_lookups += 1 } else { r_lookups += 1 } - let mut key = word.clone(); - key.extend_from_slice(&pos.to_be_bytes()); - if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() { - if i == 0 { l_bitmaps += 1 } else { r_bitmaps += 1 } - let right = RoaringBitmap::deserialize_from(attrs).unwrap(); - union_docids.union_with(&right); 
- } + // This for the left word + lunion_docids.clear(); + for (word, attrs) in &words[lword] { + if attrs.contains(lpos) { + l_lookups += 1; + let mut key = word.clone(); + key.extend_from_slice(&lpos.to_be_bytes()); + if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() { + l_bitmaps += 1; + let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap(); + lunion_docids.union_with(&right); } } - - match &mut intersect_docids { - Some(left) => left.intersect_with(&union_docids), - None => intersect_docids = Some(union_docids), - } } - *lnblookups = l_lookups; - *lnbbitmaps = l_bitmaps; - *rnblookups = r_lookups; - *rnbbitmaps = r_bitmaps; - *nb_docs_intersect += intersect_docids.as_ref().map_or(0, |i| i.len()); + // This for the right word + runion_docids.clear(); + for (word, attrs) in &words[rword] { + if attrs.contains(rpos) { + r_lookups += 1; + let mut key = word.clone(); + key.extend_from_slice(&rpos.to_be_bytes()); + if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() { + r_bitmaps += 1; + let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap(); + runion_docids.union_with(&right); + } + } + } - intersect_docids.map_or(false, |i| !i.is_empty()) + let intersect_docids = &mut lunion_docids; + intersect_docids.intersect_with(&runion_docids); + + // *lnblookups = l_lookups; + // *lnbbitmaps = l_bitmaps; + // *rnblookups = r_lookups; + // *rnbbitmaps = r_bitmaps; + // *nb_docs_intersect += intersect_docids.len(); + + !intersect_docids.is_empty() }) }; @@ -196,7 +206,7 @@ impl Index { let mut key = word.clone(); key.extend_from_slice(&pos.to_be_bytes()); if let Some(attrs) = self.postings_ids.get(rtxn, &key)? 
{ - let right = RoaringBitmap::deserialize_from(attrs)?; + let right = RoaringBitmap::deserialize_from_slice(attrs)?; union_docids.union_with(&right); count += 1; } @@ -248,7 +258,7 @@ impl Index { } } - debug_intersects_to_csv(debug_intersects); + // debug_intersects_to_csv(debug_intersects); eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>()); Ok(documents.iter().flatten().take(20).collect())