mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-25 22:34:28 +01:00
Optimize things
This commit is contained in:
parent
a3ca80d20d
commit
55a8941922
31
Cargo.lock
generated
31
Cargo.lock
generated
@ -48,7 +48,7 @@ version = "1.2.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5753e2a71534719bf3f4e57006c3a4f0d2c672a4b676eec84161f763eca87dbf"
|
checksum = "5753e2a71534719bf3f4e57006c3a4f0d2c672a4b676eec84161f763eca87dbf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -75,7 +75,7 @@ checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"block-padding",
|
"block-padding",
|
||||||
"byte-tools",
|
"byte-tools",
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"generic-array",
|
"generic-array",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -116,12 +116,6 @@ version = "0.3.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
|
checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "byteorder"
|
|
||||||
version = "0.5.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byteorder"
|
name = "byteorder"
|
||||||
version = "1.3.4"
|
version = "1.3.4"
|
||||||
@ -445,7 +439,7 @@ version = "0.2.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -533,7 +527,7 @@ version = "0.8.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fd7882b766b4be1b90d8ce5ce4c7aca2539b43176a708dbc8e79576dbbdbba93"
|
checksum = "fd7882b766b4be1b90d8ce5ce4c7aca2539b43176a708dbc8e79576dbbdbba93"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"heed-traits",
|
"heed-traits",
|
||||||
"heed-types",
|
"heed-types",
|
||||||
"libc",
|
"libc",
|
||||||
@ -784,7 +778,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bitpacking",
|
"bitpacking",
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"cow-utils",
|
"cow-utils",
|
||||||
"csv",
|
"csv",
|
||||||
"fst",
|
"fst",
|
||||||
@ -1000,7 +994,7 @@ name = "oxidized-mtbl"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=9451be8#9451be8829562f7d1f8d34aa3ecb81c5106a0623"
|
source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=9451be8#9451be8829562f7d1f8d34aa3ecb81c5106a0623"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"crc32c",
|
"crc32c",
|
||||||
"flate2",
|
"flate2",
|
||||||
"snap",
|
"snap",
|
||||||
@ -1360,7 +1354,7 @@ version = "0.1.9"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
|
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1374,11 +1368,10 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "roaring"
|
name = "roaring"
|
||||||
version = "0.5.2"
|
version = "0.6.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/Kerollmops/roaring-rs.git?branch=deserialize-from-slice#24420bb9f980749476cec860ea8dd3c1683c0cd1"
|
||||||
checksum = "4af20e5d3e44732a57489fa297768ca29361b54fbc3b20cdeb738fa6932cc22d"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 0.5.3",
|
"byteorder",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1693,7 +1686,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "cfea31758bf674f990918962e8e5f07071a3161bd7c4138ed23e416e1ac4264e"
|
checksum = "cfea31758bf674f990918962e8e5f07071a3161bd7c4138ed23e416e1ac4264e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.11.0",
|
"base64 0.11.0",
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"http",
|
"http",
|
||||||
"httparse",
|
"httparse",
|
||||||
@ -1901,7 +1894,7 @@ version = "0.3.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46"
|
checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.3.4",
|
"byteorder",
|
||||||
"zerocopy-derive",
|
"zerocopy-derive",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ memmap = "0.7.0"
|
|||||||
once_cell = "1.4.0"
|
once_cell = "1.4.0"
|
||||||
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "9451be8" }
|
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "9451be8" }
|
||||||
rayon = "1.3.0"
|
rayon = "1.3.0"
|
||||||
roaring = "0.5.2"
|
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
|
||||||
slice-group-by = "0.2.6"
|
slice-group-by = "0.2.6"
|
||||||
smallstr = "0.2.0"
|
smallstr = "0.2.0"
|
||||||
smallvec = "1.4.0"
|
smallvec = "1.4.0"
|
||||||
@ -40,3 +40,6 @@ warp = "0.2.2"
|
|||||||
|
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = true
|
debug = true
|
||||||
|
|
||||||
|
[profile.bench]
|
||||||
|
debug = true
|
||||||
|
25
benches/search.rs
Normal file
25
benches/search.rs
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#![feature(test)]
|
||||||
|
extern crate test;
|
||||||
|
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use mega_mini_indexer::Index;
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn search_minogue_kylie_live(b: &mut test::Bencher) {
|
||||||
|
let database = "books-4cpu.mmdb";
|
||||||
|
let query = "minogue kylie live";
|
||||||
|
|
||||||
|
std::fs::create_dir_all(database).unwrap();
|
||||||
|
let env = EnvOpenOptions::new()
|
||||||
|
.map_size(100 * 1024 * 1024 * 1024) // 100 GB
|
||||||
|
.max_readers(10)
|
||||||
|
.max_dbs(5)
|
||||||
|
.open(database).unwrap();
|
||||||
|
|
||||||
|
let index = Index::new(&env).unwrap();
|
||||||
|
|
||||||
|
b.iter(|| {
|
||||||
|
let rtxn = env.read_txn().unwrap();
|
||||||
|
let _documents_ids = index.search(&rtxn, query).unwrap();
|
||||||
|
})
|
||||||
|
}
|
78
src/lib.rs
78
src/lib.rs
@ -104,7 +104,7 @@ impl Index {
|
|||||||
while let Some(word) = stream.next() {
|
while let Some(word) = stream.next() {
|
||||||
let word = std::str::from_utf8(word)?;
|
let word = std::str::from_utf8(word)?;
|
||||||
if let Some(attrs) = self.postings_attrs.get(rtxn, word)? {
|
if let Some(attrs) = self.postings_attrs.get(rtxn, word)? {
|
||||||
let right = RoaringBitmap::deserialize_from(attrs)?;
|
let right = RoaringBitmap::deserialize_from_slice(attrs)?;
|
||||||
union_positions.union_with(&right);
|
union_positions.union_with(&right);
|
||||||
derived_words.push((word.as_bytes().to_vec(), right));
|
derived_words.push((word.as_bytes().to_vec(), right));
|
||||||
count += 1;
|
count += 1;
|
||||||
@ -120,57 +120,67 @@ impl Index {
|
|||||||
|
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
|
|
||||||
let mut debug_intersects = HashMap::new();
|
// let mut debug_intersects = HashMap::new();
|
||||||
let mut intersect_cache = HashMap::new();
|
let mut intersect_cache = HashMap::new();
|
||||||
|
let mut lunion_docids = RoaringBitmap::default();
|
||||||
|
let mut runion_docids = RoaringBitmap::default();
|
||||||
let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| {
|
let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| {
|
||||||
let proximity = best_proximity::positions_proximity(lpos, rpos);
|
let proximity = best_proximity::positions_proximity(lpos, rpos);
|
||||||
|
|
||||||
*intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
*intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
||||||
use std::iter::once;
|
// let (nb_words, nb_docs_intersect, lnblookups, lnbbitmaps, rnblookups, rnbbitmaps) =
|
||||||
|
// debug_intersects.entry((lword, lpos, rword, rpos, proximity)).or_default();
|
||||||
let (nb_words, nb_docs_intersect, lnblookups, lnbbitmaps, rnblookups, rnbbitmaps) =
|
|
||||||
debug_intersects.entry((lword, lpos, rword, rpos, proximity)).or_default();
|
|
||||||
|
|
||||||
let left = (&words[lword], lpos);
|
let left = (&words[lword], lpos);
|
||||||
let right = (&words[rword], rpos);
|
let right = (&words[rword], rpos);
|
||||||
|
|
||||||
*nb_words = left.0.len() + right.0.len();
|
// *nb_words = left.0.len() + right.0.len();
|
||||||
|
|
||||||
let mut l_lookups = 0;
|
let mut l_lookups = 0;
|
||||||
let mut l_bitmaps = 0;
|
let mut l_bitmaps = 0;
|
||||||
let mut r_lookups = 0;
|
let mut r_lookups = 0;
|
||||||
let mut r_bitmaps = 0;
|
let mut r_bitmaps = 0;
|
||||||
|
|
||||||
let mut intersect_docids: Option<RoaringBitmap> = None;
|
// This for the left word
|
||||||
for (i, (derived_words, pos))in once(left).chain(once(right)).enumerate() {
|
lunion_docids.clear();
|
||||||
let mut union_docids = RoaringBitmap::default();
|
for (word, attrs) in &words[lword] {
|
||||||
// TODO re-enable the prefixes system
|
if attrs.contains(lpos) {
|
||||||
for (word, attrs) in derived_words.iter() {
|
l_lookups += 1;
|
||||||
if attrs.contains(pos) {
|
let mut key = word.clone();
|
||||||
if i == 0 { l_lookups += 1 } else { r_lookups += 1 }
|
key.extend_from_slice(&lpos.to_be_bytes());
|
||||||
let mut key = word.clone();
|
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() {
|
||||||
key.extend_from_slice(&pos.to_be_bytes());
|
l_bitmaps += 1;
|
||||||
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() {
|
let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap();
|
||||||
if i == 0 { l_bitmaps += 1 } else { r_bitmaps += 1 }
|
lunion_docids.union_with(&right);
|
||||||
let right = RoaringBitmap::deserialize_from(attrs).unwrap();
|
|
||||||
union_docids.union_with(&right);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
match &mut intersect_docids {
|
|
||||||
Some(left) => left.intersect_with(&union_docids),
|
|
||||||
None => intersect_docids = Some(union_docids),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
*lnblookups = l_lookups;
|
// This for the right word
|
||||||
*lnbbitmaps = l_bitmaps;
|
runion_docids.clear();
|
||||||
*rnblookups = r_lookups;
|
for (word, attrs) in &words[rword] {
|
||||||
*rnbbitmaps = r_bitmaps;
|
if attrs.contains(rpos) {
|
||||||
*nb_docs_intersect += intersect_docids.as_ref().map_or(0, |i| i.len());
|
r_lookups += 1;
|
||||||
|
let mut key = word.clone();
|
||||||
|
key.extend_from_slice(&rpos.to_be_bytes());
|
||||||
|
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() {
|
||||||
|
r_bitmaps += 1;
|
||||||
|
let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap();
|
||||||
|
runion_docids.union_with(&right);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
intersect_docids.map_or(false, |i| !i.is_empty())
|
let intersect_docids = &mut lunion_docids;
|
||||||
|
intersect_docids.intersect_with(&runion_docids);
|
||||||
|
|
||||||
|
// *lnblookups = l_lookups;
|
||||||
|
// *lnbbitmaps = l_bitmaps;
|
||||||
|
// *rnblookups = r_lookups;
|
||||||
|
// *rnbbitmaps = r_bitmaps;
|
||||||
|
// *nb_docs_intersect += intersect_docids.len();
|
||||||
|
|
||||||
|
!intersect_docids.is_empty()
|
||||||
})
|
})
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -196,7 +206,7 @@ impl Index {
|
|||||||
let mut key = word.clone();
|
let mut key = word.clone();
|
||||||
key.extend_from_slice(&pos.to_be_bytes());
|
key.extend_from_slice(&pos.to_be_bytes());
|
||||||
if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
|
if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
|
||||||
let right = RoaringBitmap::deserialize_from(attrs)?;
|
let right = RoaringBitmap::deserialize_from_slice(attrs)?;
|
||||||
union_docids.union_with(&right);
|
union_docids.union_with(&right);
|
||||||
count += 1;
|
count += 1;
|
||||||
}
|
}
|
||||||
@ -248,7 +258,7 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug_intersects_to_csv(debug_intersects);
|
// debug_intersects_to_csv(debug_intersects);
|
||||||
|
|
||||||
eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>());
|
eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>());
|
||||||
Ok(documents.iter().flatten().take(20).collect())
|
Ok(documents.iter().flatten().take(20).collect())
|
||||||
|
Loading…
Reference in New Issue
Block a user