From 91ba938953e202d8ede5a67ec1888c03a9ab239a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 25 May 2020 20:39:53 +0200 Subject: [PATCH] Initial commit --- .gitignore | 1 + Cargo.lock | 749 ++++++++++++++++++++++++++++++++++ Cargo.toml | 27 ++ qc_loop.sh | 11 + src/bp_vec.rs | 197 +++++++++ src/codec/bitpacker_sorted.rs | 84 ++++ src/codec/mod.rs | 3 + src/main.rs | 186 +++++++++ 8 files changed, 1258 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100755 qc_loop.sh create mode 100644 src/bp_vec.rs create mode 100644 src/codec/bitpacker_sorted.rs create mode 100644 src/codec/mod.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..ea8c4bf7f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 000000000..18931ea68 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,749 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+[[package]] +name = "aho-corasick" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" + +[[package]] +name = "autocfg" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + +[[package]] +name = "bitpacking" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68" +dependencies = [ + "crunchy", +] + +[[package]] +name = "bstr" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + +[[package]] +name = "cc" +version = "1.0.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "clap" +version = "2.33.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +dependencies = [ + "bitflags", +] + +[[package]] +name = "crc32fast" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "maybe-uninit", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c695eeca1e7173472a32221542ae469b3e9aac3a4fc81f7696bcad82029493db" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +dependencies = [ + "autocfg", + "cfg-if", + "lazy_static", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "csv" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "either" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" + +[[package]] +name = "env_logger" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "fs_extra" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674" + +[[package]] +name = "fst" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51" + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71" +dependencies = [ + "libc", +] + +[[package]] +name = "itoa" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e" + +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f" + +[[package]] +name = "lock_api" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + +[[package]] +name = "mega-mini-indexer" +version = "0.1.0" +dependencies = [ + "anyhow", + "bitpacking", + "byteorder", + "csv", + "fst", + "fxhash", + "jemallocator", + "quickcheck", + "rayon", + "sdset", + "sled", + "slice-group-by", + "smallstr", + "structopt", + "zerocopy", +] + +[[package]] +name = "memchr" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" + +[[package]] +name = "memoffset" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "parking_lot" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3" +dependencies = [ + "cfg-if", + "cloudabi", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea" + +[[package]] +name = "proc-macro-error" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "syn-mid", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1502d12e458c49a4c9cbff560d0fe0060c252bc29799ed94ca2ed4bb665a0101" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quickcheck" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44883e74aa97ad63db83c4bf8ca490f02b2fc02f92575e720c8551e843c945f" +dependencies = [ + "env_logger", + "log", + "rand", + "rand_core", +] + +[[package]] +name = "quote" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom", + "libc", + "rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + 
"ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rayon" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098" +dependencies = [ + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9" +dependencies = [ + "crossbeam-deque", + "crossbeam-queue", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" + +[[package]] +name = "regex" +version = "1.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", +] + +[[package]] +name = "regex-automata" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", +] + +[[package]] +name = "regex-syntax" +version = "0.6.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" + +[[package]] +name = "ryu" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3d612bc64430efeb3f7ee6ef26d590dce0c43249217bddc62112540c7941e1" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "sdset" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb21fe0588557792176c89bc7b943027b14f346d03c6be6a199c2860277d93a" + +[[package]] +name = "serde" +version = "1.0.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c" + +[[package]] +name = "sled" +version = "0.31.0" +source = "git+https://github.com/spacejam/sled.git?rev=2fe05c9#2fe05c933a4a68d4dbbc06a16a3058236fcc6350" +dependencies = [ + "crc32fast", + "crossbeam-epoch", + "crossbeam-utils", + "fs2", + "fxhash", + "libc", + "log", + "parking_lot", +] + +[[package]] +name = "slice-group-by" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb" + +[[package]] +name = "smallstr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f" +dependencies = [ + "smallvec", +] + +[[package]] +name = "smallvec" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" + +[[package]] +name = "structopt" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" +dependencies = [ + 
"clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95b5f192649e48a5302a13f2feb224df883b98933222369e4b3b0fe2a5447269" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "syn-mid" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "synstructure" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67656ea1dc1b41b1451851562ea232ec2e5a80242139f7e679ceccfb5d61f545" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "unicode-segmentation" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" + +[[package]] +name = "unicode-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" + +[[package]] +name = "unicode-xid" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" + +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "winapi" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "zerocopy" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" +dependencies = [ + "proc-macro2", + "syn", + "synstructure", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 
000000000..cfa000799 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "mega-mini-indexer" +version = "0.1.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +anyhow = "1.0.28" +bitpacking = "0.8.2" +byteorder = "1.3.4" +csv = "1.1.3" +fst = "0.4.3" +fxhash = "0.2.1" +jemallocator = "0.3.2" +rayon = "1.3.0" +sdset = "0.4.0" +sled = { git = "https://github.com/spacejam/sled.git", rev = "2fe05c9"} +slice-group-by = "0.2.6" +smallstr = "0.2.0" +structopt = { version = "0.3.14", default-features = false } +zerocopy = "0.3.0" + +[dev-dependencies] +quickcheck = "0.9.2" + +[profile.release] +debug = true diff --git a/qc_loop.sh b/qc_loop.sh new file mode 100755 index 000000000..c479307cc --- /dev/null +++ b/qc_loop.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +export RUST_BACKTRACE=1 + +while true +do + cargo test qc_ --release -- --nocapture + if [[ x$? != x0 ]] ; then + exit $? + fi +done diff --git a/src/bp_vec.rs b/src/bp_vec.rs new file mode 100644 index 000000000..d567ac0a5 --- /dev/null +++ b/src/bp_vec.rs @@ -0,0 +1,197 @@ +use byteorder::{ByteOrder, NativeEndian}; +use bitpacking::{BitPacker, BitPacker4x}; + +/// An append only bitpacked u32 vector that ignore order of insertion. 
+#[derive(Default)] +pub struct BpVec { + compressed: Vec, + uncompressed: Vec, +} + +impl BpVec { + pub fn new() -> BpVec { + BpVec::default() + } + + pub fn push(&mut self, elem: u32) { + self.uncompressed.push(elem); + if self.uncompressed.len() == BitPacker4x::BLOCK_LEN { + encode(&mut self.uncompressed[..], &mut self.compressed); + self.uncompressed.clear(); + } + } + + pub fn extend_from_slice(&mut self, elems: &[u32]) { + self.uncompressed.extend_from_slice(elems); + let remaining = self.uncompressed.len() % BitPacker4x::BLOCK_LEN; + for chunk in self.uncompressed[remaining..].chunks_exact_mut(BitPacker4x::BLOCK_LEN) { + encode(chunk, &mut self.compressed); + } + self.uncompressed.truncate(remaining); + self.uncompressed.shrink_to_fit(); + } + + pub fn to_vec(self) -> Vec { + let BpVec { compressed, mut uncompressed } = self; + decode(&compressed, &mut uncompressed); + uncompressed + } + + pub fn capacity(&self) -> usize { + self.compressed.capacity() + self.uncompressed.capacity() + } +} + +fn encode(items: &mut [u32], encoded: &mut Vec) { + assert_eq!(items.len(), BitPacker4x::BLOCK_LEN); + + let bitpacker = BitPacker4x::new(); + + // We reserve enough space in the output buffer, filled with zeroes. + let len = encoded.len(); + // initial_value + num_bits + encoded numbers + let max_possible_length = 4 + 1 + 4 * BitPacker4x::BLOCK_LEN; + encoded.resize(len + max_possible_length, 0); + + // We sort the items to be able to efficiently bitpack them. + items.sort_unstable(); + // We save the initial value to us for this block, the lowest one. + let initial_value = items[0]; + // We compute the number of bits necessary to encode this block + let num_bits = bitpacker.num_bits_sorted(initial_value, items); + + // We write the initial value for this block. 
+ let buffer = &mut encoded[len..]; + NativeEndian::write_u32(buffer, initial_value); + // We write the num_bits that will be read to decode this block + let buffer = &mut buffer[4..]; + buffer[0] = num_bits; + // We encode the block numbers into the buffer using the num_bits + let buffer = &mut buffer[1..]; + let compressed_len = bitpacker.compress_sorted(initial_value, items, buffer, num_bits); + + // We truncate the buffer to the avoid leaking padding zeroes + encoded.truncate(len + 4 + 1 + compressed_len); +} + +fn decode(mut encoded: &[u8], decoded: &mut Vec) { + let bitpacker = BitPacker4x::new(); + + // initial_value + num_bits + while let Some(header) = encoded.get(0..4 + 1) { + // We extract the header informations + let initial_value = NativeEndian::read_u32(header); + let num_bits = header[4]; + let bytes = &encoded[4 + 1..]; + + // If the num_bits is equal to zero it means that all encoded numbers were zeroes + if num_bits == 0 { + decoded.resize(decoded.len() + BitPacker4x::BLOCK_LEN, initial_value); + encoded = bytes; + continue; + } + + // We guess the block size based on the num_bits used for this block + let block_size = BitPacker4x::compressed_block_size(num_bits); + + // We pad the decoded vector with zeroes + let new_len = decoded.len() + BitPacker4x::BLOCK_LEN; + decoded.resize(new_len, 0); + + // Create a view into the decoded buffer and decode into it + let to_decompress = &mut decoded[new_len - BitPacker4x::BLOCK_LEN..new_len]; + bitpacker.decompress_sorted(initial_value, &bytes[..block_size], to_decompress, num_bits); + + // Advance the bytes offset to read the next block (+ num_bits) + encoded = &bytes[block_size..]; + } +} + +impl sdset::Collection for BpVec { + fn push(&mut self, elem: u32) { + BpVec::push(self, elem); + } + + fn extend_from_slice(&mut self, elems: &[u32]) { + BpVec::extend_from_slice(self, elems); + } + + fn extend(&mut self, elems: I) where I: IntoIterator { + elems.into_iter().for_each(|x| BpVec::push(self, x)); + } 
+} + +#[cfg(test)] +mod tests { + use super::*; + + quickcheck! { + fn qc_push(xs: Vec) -> bool { + let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect(); + + let mut bpvec = BpVec::new(); + xs.iter().for_each(|x| bpvec.push(*x)); + let mut result = bpvec.to_vec(); + + result.sort_unstable(); + xs.sort_unstable(); + + xs == result + } + } + + quickcheck! { + fn qc_extend_from_slice(xs: Vec) -> bool { + let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect(); + + let mut bpvec = BpVec::new(); + bpvec.extend_from_slice(&xs); + let mut result = bpvec.to_vec(); + + result.sort_unstable(); + xs.sort_unstable(); + + xs == result + } + } + + #[test] + fn empty() { + let mut bpvec = BpVec::new(); + bpvec.extend_from_slice(&[]); + let result = bpvec.to_vec(); + + assert!(result.is_empty()); + } + + #[test] + fn one_zero() { + let mut bpvec = BpVec::new(); + bpvec.extend_from_slice(&[0]); + let result = bpvec.to_vec(); + + assert_eq!(&[0], &*result); + } + + #[test] + fn many_zeros() { + let xs: Vec<_> = std::iter::repeat(0).take(1300).collect(); + + let mut bpvec = BpVec::new(); + bpvec.extend_from_slice(&xs); + let result = bpvec.to_vec(); + + assert_eq!(xs, result); + } + + #[test] + fn many_ones() { + let xs: Vec<_> = std::iter::repeat(1).take(1300).collect(); + + let mut bpvec = BpVec::new(); + bpvec.extend_from_slice(&xs); + let result = bpvec.to_vec(); + + assert_eq!(xs, result); + } +} diff --git a/src/codec/bitpacker_sorted.rs b/src/codec/bitpacker_sorted.rs new file mode 100644 index 000000000..274e2c2bb --- /dev/null +++ b/src/codec/bitpacker_sorted.rs @@ -0,0 +1,84 @@ +use bitpacking::{BitPacker, BitPacker4x}; +use byteorder::{ReadBytesExt, NativeEndian}; +use zerocopy::AsBytes; + +pub struct CodecBitPacker4xSorted; + +impl CodecBitPacker4xSorted { + pub fn bytes_encode(item: &[u32]) -> Option> { + // This is a hotfix to the SIGSEGV + // https://github.com/tantivy-search/bitpacking/issues/23 + if item.is_empty() { + return 
Some(Vec::default()) + } + + let bitpacker = BitPacker4x::new(); + let mut compressed = Vec::new(); + let mut initial_value = 0; + + // The number of remaining numbers that don't fit in the block size. + compressed.push((item.len() % BitPacker4x::BLOCK_LEN) as u8); + + // we cannot use a mut slice here because of #68630, TooGeneric error. + // we can probably avoid this new allocation by directly using the compressed final Vec. + let mut buffer = vec![0u8; 4 * BitPacker4x::BLOCK_LEN]; + + for chunk in item.chunks(BitPacker4x::BLOCK_LEN) { + if chunk.len() == BitPacker4x::BLOCK_LEN { + // compute the number of bits necessary to encode this block + let num_bits = bitpacker.num_bits_sorted(initial_value, chunk); + // Encode the block numbers into the buffer using the num_bits + let compressed_len = bitpacker.compress_sorted(initial_value, chunk, &mut buffer, num_bits); + // Write the num_bits that will be read to decode this block + compressed.push(num_bits); + // Wrtie the bytes of the compressed block numbers + compressed.extend_from_slice(&buffer[..compressed_len]); + // Save the initial_value, which is the last value of the n-1 used for the n block + initial_value = *chunk.last().unwrap(); + } else { + // Save the remaining numbers which don't fit inside of a BLOCK_LEN + compressed.extend_from_slice(chunk.as_bytes()); + } + } + + Some(compressed) + } + + pub fn bytes_decode(bytes: &[u8]) -> Option> { + if bytes.is_empty() { + return Some(Vec::new()) + } + + let bitpacker = BitPacker4x::new(); + let (remaining, bytes) = bytes.split_first().unwrap(); + let remaining = *remaining as usize; + + let (mut bytes, mut remaining_bytes) = bytes.split_at(bytes.len() - remaining * 4); + let mut decompressed = Vec::new(); + let mut initial_value = 0; + + while let Some(num_bits) = bytes.get(0) { + let block_size = BitPacker4x::compressed_block_size(*num_bits); + + let new_len = decompressed.len() + BitPacker4x::BLOCK_LEN; + decompressed.resize(new_len, 0); + + // Create a view 
into the decompressed buffer and decomress into it
+            let to_decompress = &mut decompressed[new_len - BitPacker4x::BLOCK_LEN..new_len];
+            bitpacker.decompress_sorted(initial_value, &bytes[1..block_size + 1], to_decompress, *num_bits);
+
+            // Set the new initial_value for the next block
+            initial_value = *decompressed.last().unwrap();
+            // Advance the bytes offset to read the next block (+ num_bits)
+            bytes = &bytes[block_size + 1..];
+        }
+
+        // We add the remaining uncompressed numbers.
+        let new_len = decompressed.len() + remaining;
+        decompressed.resize(new_len, 0);
+        let to_decompress = &mut decompressed[new_len - remaining..new_len];
+        remaining_bytes.read_u32_into::(to_decompress).ok()?;
+
+        Some(decompressed)
+    }
+}
diff --git a/src/codec/mod.rs b/src/codec/mod.rs
new file mode 100644
index 000000000..451839fea
--- /dev/null
+++ b/src/codec/mod.rs
@@ -0,0 +1,3 @@
+mod bitpacker_sorted;
+
+pub use self::bitpacker_sorted::CodecBitPacker4xSorted;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 000000000..c8d345baf
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,239 @@
+#[cfg(test)]
+#[macro_use] extern crate quickcheck;
+
+mod codec;
+mod bp_vec;
+
+use std::collections::{HashMap, BTreeSet};
+use std::convert::TryFrom;
+use std::fs::File;
+use std::hash::BuildHasherDefault;
+use std::path::PathBuf;
+
+use anyhow::{ensure, Context};
+use fst::IntoStreamer;
+use fxhash::FxHasher32;
+use rayon::prelude::*;
+use sdset::{SetOperation, SetBuf};
+use slice_group_by::StrGroupBy;
+use structopt::StructOpt;
+
+use self::codec::CodecBitPacker4xSorted;
+use self::bp_vec::BpVec;
+
+pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
+pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
+
+#[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
+// Command line arguments of the indexer. A plain comment is used here because
+// the `///` comments of the fields below end up in the generated help text.
+#[derive(Debug, StructOpt)]
+#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
+struct Opt {
+    /// The database path where the database is located.
+    /// It is created if it doesn't already exist.
+    #[structopt(long = "db", parse(from_os_str))]
+    database: PathBuf,
+
+    /// Files to index in parallel.
+    files_to_index: Vec<PathBuf>,
+}
+
+/// Merge operator of the `postings-ids` tree: unions two bitpacked lists of ids.
+///
+/// Both values are decoded with `CodecBitPacker4xSorted`, united as sorted sets
+/// and the result is encoded again. When the key has no previous value the new
+/// value is stored untouched.
+///
+/// # Panics
+///
+/// Panics when a value cannot be decoded, when the decoded ids are rejected by
+/// `SetBuf::new` or when the union cannot be encoded.
+fn union_bitpacked_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
+    // Nothing to merge with: keep the new value as is.
+    let old_value = match old_value {
+        Some(old_value) => old_value,
+        None => return Some(new_value.to_vec()),
+    };
+
+    // NOTE(review): failures keep panicking rather than returning `None`, sled
+    // presumably deletes the key when a merge operator returns `None`; confirm.
+    let old_ids = CodecBitPacker4xSorted::bytes_decode(old_value)
+        .expect("invalid CodecBitPacker4xSorted bytes in the stored postings ids");
+    let new_ids = CodecBitPacker4xSorted::bytes_decode(new_value)
+        .expect("invalid CodecBitPacker4xSorted bytes in the merged postings ids");
+
+    let old_set = SetBuf::new(old_ids).expect("stored postings ids rejected by SetBuf::new");
+    let new_set = SetBuf::new(new_ids).expect("merged postings ids rejected by SetBuf::new");
+
+    let merged = sdset::duo::Union::new(&old_set, &new_set).into_set_buf();
+    let compressed = CodecBitPacker4xSorted::bytes_encode(&merged)
+        .expect("unable to compress the union of the postings ids");
+
+    Some(compressed)
+}
+
+/// Merge operator of the main tree: unions two serialized `fst::Set`s of words.
+///
+/// When the key has no previous value the new set is stored untouched.
+///
+/// # Panics
+///
+/// Panics when `key` is not `words-fst`, when a value is not a valid fst set or
+/// when the union cannot be built.
+fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
+    if key != b"words-fst" { unimplemented!("only the `words-fst` key can be merged") }
+
+    let old_value = match old_value {
+        Some(old_value) => old_value,
+        None => return Some(new_value.to_vec()),
+    };
+
+    eprintln!("old_words size: {}", old_value.len());
+    eprintln!("new_words size: {}", new_value.len());
+
+    let old_words = fst::Set::new(old_value).expect("stored words-fst is not a valid fst set");
+    let new_words = fst::Set::new(new_value).expect("merged words-fst is not a valid fst set");
+
+    // Stream the union of the old and the new set of words into a new fst.
+    let op = old_words.op().add(new_words.into_stream()).r#union();
+    let mut build = fst::SetBuilder::memory();
+    build.extend_stream(op.into_stream()).expect("unable to build the union of the words fst");
+
+    Some(build.into_inner().expect("unable to finish the union of the words fst"))
+}
+
+/// Returns the alphanumeric tokens of `string`, in order.
+///
+/// The string is grouped by the `char::is_alphanumeric` result of its
+/// characters and only the groups starting with an alphanumeric character are
+/// kept.
+fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
+    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
+    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
+}
+
+/// Indexes every record of the CSV reader `rdr` into `db`.
+///
+/// Each record gets an id generated by the database and is stored in the
+/// `documents` tree. The ids of the documents containing a word are merged into
+/// the `postings-ids` tree and the words are merged into the `words-fst` entry
+/// of the main tree. `tid` only tags the progress messages written on stderr.
+///
+/// Returns the number of documents read from `rdr`.
+///
+/// # Errors
+///
+/// Fails on CSV and database errors, when a generated id doesn't fit in a
+/// `u32`, when the postings ids cannot be compressed and when the headers
+/// differ from the ones already stored in the database.
+fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Result<usize> {
+    const MAX_POSITION: usize = 1000;
+    const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
+
+    let main = &*db;
+    let postings_ids = db.open_tree("postings-ids")?;
+    let documents = db.open_tree("documents")?;
+
+    let mut document = csv::StringRecord::new();
+    let mut new_postings_ids = FastMap4::default();
+    let mut new_words = BTreeSet::default();
+    let mut number_of_documents = 0;
+
+    // Write the headers into a Vec of bytes.
+    let headers = rdr.headers()?;
+    let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
+    writer.write_byte_record(headers.as_byte_record())?;
+    let headers = writer.into_inner()?;
+
+    if let Some(old_headers) = main.insert("headers", headers.as_slice())? {
+        ensure!(old_headers == headers, "headers differs from the previous ones");
+    }
+
+    while rdr.read_record(&mut document)? {
+        let document_id = db.generate_id()?;
+        let document_id = u32::try_from(document_id).context("Generated id is too big")?;
+
+        // Only the first MAX_ATTRIBUTES fields of a record and the first
+        // MAX_POSITION words of a field are indexed.
+        for content in document.iter().take(MAX_ATTRIBUTES) {
+            for word in alphanumeric_tokens(content).take(MAX_POSITION) {
+                new_postings_ids.entry(SmallString32::from(word)).or_insert_with(BpVec::new).push(document_id);
+            }
+        }
+
+        // We write the document in the database.
+        let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
+        writer.write_byte_record(document.as_byte_record())?;
+        let document = writer.into_inner()?;
+        documents.insert(document_id.to_be_bytes(), document)?;
+
+        number_of_documents += 1;
+        if number_of_documents % 100000 == 0 {
+            let postings_ids_size = new_postings_ids.values().map(|v| v.capacity() * 4).sum::<usize>();
+            eprintln!("{}, documents seen {}, postings size {}",
+                tid, number_of_documents, postings_ids_size);
+        }
+    }
+
+    eprintln!("Start collecting the postings lists and words");
+
+    // We compute and store the postings list into the DB.
+    for (word, new_ids) in new_postings_ids {
+        let new_ids = SetBuf::from_dirty(new_ids.to_vec());
+        let compressed = CodecBitPacker4xSorted::bytes_encode(&new_ids)
+            .context("error while compressing using CodecBitPacker4xSorted")?;
+
+        postings_ids.merge(word.as_bytes(), compressed)?;
+
+        new_words.insert(word);
+    }
+
+    eprintln!("Finished collecting the postings lists and words");
+
+    eprintln!("Start merging the words-fst");
+
+    let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?;
+    drop(new_words);
+    main.merge("words-fst", new_words_fst.as_fst().as_bytes())?;
+
+    eprintln!("Finished merging the words-fst");
+
+    Ok(number_of_documents)
+}
+
+/// Opens the database, registers the merge operators and indexes in parallel
+/// every file given on the command line.
+///
+/// The result is printed on stdout; an indexing failure is also returned so
+/// that the process doesn't exit with a success status.
+fn main() -> anyhow::Result<()> {
+    let opt = Opt::from_args();
+
+    let db = sled::open(opt.database)?;
+    let main = &*db;
+
+    // Setup the merge operators
+    main.set_merge_operator(union_words_fst);
+    let postings_ids = db.open_tree("postings-ids")?;
+    postings_ids.set_merge_operator(union_bitpacked_postings_ids);
+    // ...
+    let _documents = db.open_tree("documents")?;
+
+    let res = opt.files_to_index
+        .into_par_iter()
+        .enumerate()
+        .map(|(tid, path)| {
+            let rdr = csv::Reader::from_path(path)?;
+            index_csv(tid, db.clone(), rdr)
+        })
+        .try_reduce(|| 0, |a, b| Ok(a + b));
+
+    println!("{:?}", res);
+
+    // Propagate the failure instead of always reporting a success.
+    res.map(|_| ())
+}