Initial commit

This commit is contained in:
Kerollmops 2020-05-25 20:39:53 +02:00
parent 4573f00a0d
commit 91ba938953
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
8 changed files with 1258 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

749
Cargo.lock generated Normal file
View File

@ -0,0 +1,749 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "aho-corasick"
version = "0.7.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada"
dependencies = [
"memchr",
]
[[package]]
name = "anyhow"
version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
[[package]]
name = "autocfg"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d"
[[package]]
name = "bitflags"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "bitpacking"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68"
dependencies = [
"crunchy",
]
[[package]]
name = "bstr"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931"
dependencies = [
"lazy_static",
"memchr",
"regex-automata",
"serde",
]
[[package]]
name = "byteorder"
version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
[[package]]
name = "cc"
version = "1.0.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
[[package]]
name = "cfg-if"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]]
name = "clap"
version = "2.33.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
dependencies = [
"bitflags",
"textwrap",
"unicode-width",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
dependencies = [
"bitflags",
]
[[package]]
name = "crc32fast"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-deque"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
"maybe-uninit",
]
[[package]]
name = "crossbeam-epoch"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace"
dependencies = [
"autocfg",
"cfg-if",
"crossbeam-utils",
"lazy_static",
"maybe-uninit",
"memoffset",
"scopeguard",
]
[[package]]
name = "crossbeam-queue"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c695eeca1e7173472a32221542ae469b3e9aac3a4fc81f7696bcad82029493db"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8"
dependencies = [
"autocfg",
"cfg-if",
"lazy_static",
]
[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "csv"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279"
dependencies = [
"bstr",
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
dependencies = [
"memchr",
]
[[package]]
name = "either"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
[[package]]
name = "env_logger"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
dependencies = [
"log",
"regex",
]
[[package]]
name = "fs2"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "fs_extra"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674"
[[package]]
name = "fst"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51"
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "getrandom"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "heck"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "hermit-abi"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71"
dependencies = [
"libc",
]
[[package]]
name = "itoa"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
[[package]]
name = "jemalloc-sys"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45"
dependencies = [
"cc",
"fs_extra",
"libc",
]
[[package]]
name = "jemallocator"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69"
dependencies = [
"jemalloc-sys",
"libc",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f"
[[package]]
name = "lock_api"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75"
dependencies = [
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
dependencies = [
"cfg-if",
]
[[package]]
name = "maybe-uninit"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
[[package]]
name = "mega-mini-indexer"
version = "0.1.0"
dependencies = [
"anyhow",
"bitpacking",
"byteorder",
"csv",
"fst",
"fxhash",
"jemallocator",
"quickcheck",
"rayon",
"sdset",
"sled",
"slice-group-by",
"smallstr",
"structopt",
"zerocopy",
]
[[package]]
name = "memchr"
version = "2.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
[[package]]
name = "memoffset"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8"
dependencies = [
"autocfg",
]
[[package]]
name = "num_cpus"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
"hermit-abi",
"libc",
]
[[package]]
name = "parking_lot"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3"
dependencies = [
"cfg-if",
"cloudabi",
"libc",
"redox_syscall",
"smallvec",
"winapi",
]
[[package]]
name = "ppv-lite86"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea"
[[package]]
name = "proc-macro-error"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn-mid",
"version_check",
]
[[package]]
name = "proc-macro2"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1502d12e458c49a4c9cbff560d0fe0060c252bc29799ed94ca2ed4bb665a0101"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quickcheck"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44883e74aa97ad63db83c4bf8ca490f02b2fc02f92575e720c8551e843c945f"
dependencies = [
"env_logger",
"log",
"rand",
"rand_core",
]
[[package]]
name = "quote"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom",
"libc",
"rand_chacha",
"rand_core",
"rand_hc",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core",
]
[[package]]
name = "rayon"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
dependencies = [
"crossbeam-deque",
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
dependencies = [
"crossbeam-deque",
"crossbeam-queue",
"crossbeam-utils",
"lazy_static",
"num_cpus",
]
[[package]]
name = "redox_syscall"
version = "0.1.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
[[package]]
name = "regex"
version = "1.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
name = "regex-automata"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
dependencies = [
"byteorder",
]
[[package]]
name = "regex-syntax"
version = "0.6.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae"
[[package]]
name = "ryu"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed3d612bc64430efeb3f7ee6ef26d590dce0c43249217bddc62112540c7941e1"
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "sdset"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbb21fe0588557792176c89bc7b943027b14f346d03c6be6a199c2860277d93a"
[[package]]
name = "serde"
version = "1.0.110"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c"
[[package]]
name = "sled"
version = "0.31.0"
source = "git+https://github.com/spacejam/sled.git?rev=2fe05c9#2fe05c933a4a68d4dbbc06a16a3058236fcc6350"
dependencies = [
"crc32fast",
"crossbeam-epoch",
"crossbeam-utils",
"fs2",
"fxhash",
"libc",
"log",
"parking_lot",
]
[[package]]
name = "slice-group-by"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb"
[[package]]
name = "smallstr"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f"
dependencies = [
"smallvec",
]
[[package]]
name = "smallvec"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4"
[[package]]
name = "structopt"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef"
dependencies = [
"clap",
"lazy_static",
"structopt-derive",
]
[[package]]
name = "structopt-derive"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95b5f192649e48a5302a13f2feb224df883b98933222369e4b3b0fe2a5447269"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "syn-mid"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "synstructure"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67656ea1dc1b41b1451851562ea232ec2e5a80242139f7e679ceccfb5d61f545"
dependencies = [
"proc-macro2",
"quote",
"syn",
"unicode-xid",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static",
]
[[package]]
name = "unicode-segmentation"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
[[package]]
name = "unicode-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
[[package]]
name = "unicode-xid"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
[[package]]
name = "version_check"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "winapi"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "zerocopy"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46"
dependencies = [
"byteorder",
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb"
dependencies = [
"proc-macro2",
"syn",
"synstructure",
]

27
Cargo.toml Normal file
View File

@ -0,0 +1,27 @@
[package]
name = "mega-mini-indexer"
version = "0.1.0"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
[dependencies]
anyhow = "1.0.28"
bitpacking = "0.8.2"
byteorder = "1.3.4"
csv = "1.1.3"
fst = "0.4.3"
fxhash = "0.2.1"
jemallocator = "0.3.2"
rayon = "1.3.0"
sdset = "0.4.0"
sled = { git = "https://github.com/spacejam/sled.git", rev = "2fe05c9"}
slice-group-by = "0.2.6"
smallstr = "0.2.0"
structopt = { version = "0.3.14", default-features = false }
zerocopy = "0.3.0"
[dev-dependencies]
quickcheck = "0.9.2"
[profile.release]
debug = true

11
qc_loop.sh Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Runs the quickcheck tests (every test whose name contains `qc_`) in a loop
# until one run fails, then exits with the exit status of that failing run.

export RUST_BACKTRACE=1
while true
do
    cargo test qc_ --release -- --nocapture
    # Capture the status right away: any later command (including the `[[ ]]`
    # test below) overwrites `$?`.
    status=$?
    if [[ $status -ne 0 ]] ; then
        exit "$status"
    fi
done

197
src/bp_vec.rs Normal file
View File

@ -0,0 +1,197 @@
use byteorder::{ByteOrder, NativeEndian};
use bitpacking::{BitPacker, BitPacker4x};
/// An append-only bitpacked `u32` vector that ignores the order of insertion.
///
/// Values are buffered in `uncompressed` until `BitPacker4x::BLOCK_LEN` of them
/// are available; that block is then sorted, bitpacked by `encode` and appended
/// to `compressed`. Because blocks are sorted before packing, `to_vec` returns
/// the same multiset of values but not in insertion order.
#[derive(Default)]
pub struct BpVec {
/// Bitpacked blocks stored back to back, each one prefixed by its header
/// (initial value + number of bits), as written by `encode`.
compressed: Vec<u8>,
/// Values that have not been packed into a block yet.
uncompressed: Vec<u32>,
}
impl BpVec {
    /// Creates an empty `BpVec`.
    pub fn new() -> BpVec {
        Default::default()
    }

    /// Appends a single value.
    ///
    /// As soon as the pending buffer holds exactly one full block, that block
    /// is bitpacked and the pending buffer is emptied.
    pub fn push(&mut self, elem: u32) {
        let pending = &mut self.uncompressed;
        pending.push(elem);
        if pending.len() != BitPacker4x::BLOCK_LEN {
            return;
        }
        encode(pending.as_mut_slice(), &mut self.compressed);
        pending.clear();
    }

    /// Appends all the values of `elems`.
    ///
    /// Every full block that can be formed is bitpacked; only the first
    /// `len % BLOCK_LEN` pending values are kept uncompressed (order of
    /// insertion is not preserved by this type, so which values stay pending
    /// does not matter).
    pub fn extend_from_slice(&mut self, elems: &[u32]) {
        let BpVec { compressed, uncompressed } = self;
        uncompressed.extend_from_slice(elems);

        let leftover = uncompressed.len() % BitPacker4x::BLOCK_LEN;
        // Everything after the `leftover` first values is a whole number of blocks.
        let (_, full_blocks) = uncompressed.split_at_mut(leftover);
        full_blocks
            .chunks_exact_mut(BitPacker4x::BLOCK_LEN)
            .for_each(|block| encode(block, compressed));

        // Drop the values that were just packed and give the memory back.
        uncompressed.truncate(leftover);
        uncompressed.shrink_to_fit();
    }

    /// Consumes the vector and returns all the stored values.
    ///
    /// The pending (uncompressed) values come first, followed by the values of
    /// each packed block in ascending order within the block.
    pub fn to_vec(self) -> Vec<u32> {
        let mut values = self.uncompressed;
        decode(&self.compressed, &mut values);
        values
    }

    /// Returns the sum of the capacities of the two internal buffers.
    ///
    /// Note that the compressed buffer counts bytes while the uncompressed one
    /// counts `u32` values.
    pub fn capacity(&self) -> usize {
        let packed = self.compressed.capacity();
        let pending = self.uncompressed.capacity();
        packed + pending
    }
}
/// Sorts `items` (exactly one block) and appends its bitpacked form to `encoded`.
///
/// Layout of what is appended: the lowest value of the block as a native-endian
/// `u32`, one byte holding the number of bits used per delta, then the
/// bitpacked payload.
///
/// Panics if `items.len()` is not `BitPacker4x::BLOCK_LEN`.
fn encode(items: &mut [u32], encoded: &mut Vec<u8>) {
    // initial_value (u32) + num_bits (u8)
    const HEADER_LEN: usize = 4 + 1;

    assert_eq!(items.len(), BitPacker4x::BLOCK_LEN);
    let packer = BitPacker4x::new();

    // Grow the output by the worst case size (header + 4 bytes per number),
    // zero filled; the unused tail is removed at the end.
    let start = encoded.len();
    encoded.resize(start + HEADER_LEN + 4 * BitPacker4x::BLOCK_LEN, 0);

    // Sorting is what makes the delta bitpacking efficient.
    items.sort_unstable();

    // The lowest value is the reference the deltas are computed from.
    let lowest = items[0];
    let width = packer.num_bits_sorted(lowest, items);

    let (header, payload) = encoded[start..].split_at_mut(HEADER_LEN);
    NativeEndian::write_u32(header, lowest);
    header[4] = width;
    let written = packer.compress_sorted(lowest, items, payload, width);

    // Keep only the bytes that were really produced, no padding zeroes.
    encoded.truncate(start + HEADER_LEN + written);
}
/// Decodes every block found in `encoded` (as produced by `encode`) and
/// appends the values to `decoded`.
///
/// Decoding stops as soon as fewer bytes than a block header remain.
/// Panics if a block payload is shorter than its header announces.
fn decode(mut encoded: &[u8], decoded: &mut Vec<u32>) {
    // initial_value (u32) + num_bits (u8)
    const HEADER_LEN: usize = 4 + 1;

    let packer = BitPacker4x::new();

    while encoded.len() >= HEADER_LEN {
        let (header, rest) = encoded.split_at(HEADER_LEN);
        let lowest = NativeEndian::read_u32(header);
        let width = header[4];
        let filled = decoded.len();

        if width == 0 {
            // Zero bits per delta: the block has no payload and every one of
            // its values is equal to the initial value.
            decoded.resize(filled + BitPacker4x::BLOCK_LEN, lowest);
            encoded = rest;
        } else {
            // The payload length is fully determined by the bit width.
            let packed_len = BitPacker4x::compressed_block_size(width);

            // Make room for one block and decompress straight into it.
            decoded.resize(filled + BitPacker4x::BLOCK_LEN, 0);
            packer.decompress_sorted(lowest, &rest[..packed_len], &mut decoded[filled..], width);

            // Move on to the header of the next block.
            encoded = &rest[packed_len..];
        }
    }
}
/// Lets sdset set operations write their output directly into a `BpVec`.
///
/// Every method forwards to the inherent `BpVec` method of the same purpose.
impl sdset::Collection<u32> for BpVec {
    fn push(&mut self, elem: u32) {
        BpVec::push(self, elem)
    }

    fn extend_from_slice(&mut self, elems: &[u32]) {
        BpVec::extend_from_slice(self, elems)
    }

    fn extend<I>(&mut self, elems: I) where I: IntoIterator<Item=u32> {
        // Push one by one so that full blocks get packed along the way.
        for elem in elems {
            BpVec::push(self, elem);
        }
    }
}
// Round-trip tests for `BpVec`: whatever is inserted must come back out of
// `to_vec`, up to ordering (the vector does not preserve insertion order).
#[cfg(test)]
mod tests {
use super::*;
// Property: pushing values one by one preserves the multiset of values.
quickcheck! {
fn qc_push(xs: Vec<u32>) -> bool {
// Repeat the generated input up to 1300 values (stays empty when `xs` is empty);
// presumably chosen to exercise both packed blocks and leftover values.
let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect();
let mut bpvec = BpVec::new();
xs.iter().for_each(|x| bpvec.push(*x));
let mut result = bpvec.to_vec();
// Compare sorted copies because the output order is unspecified.
result.sort_unstable();
xs.sort_unstable();
xs == result
}
}
// Property: bulk insertion preserves the multiset of values.
quickcheck! {
fn qc_extend_from_slice(xs: Vec<u32>) -> bool {
// Same input shaping as in `qc_push`.
let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect();
let mut bpvec = BpVec::new();
bpvec.extend_from_slice(&xs);
let mut result = bpvec.to_vec();
// Compare sorted copies because the output order is unspecified.
result.sort_unstable();
xs.sort_unstable();
xs == result
}
}
// Extending with nothing yields an empty vector.
#[test]
fn empty() {
let mut bpvec = BpVec::new();
bpvec.extend_from_slice(&[]);
let result = bpvec.to_vec();
assert!(result.is_empty());
}
// A single value never fills a block and must survive uncompressed.
#[test]
fn one_zero() {
let mut bpvec = BpVec::new();
bpvec.extend_from_slice(&[0]);
let result = bpvec.to_vec();
assert_eq!(&[0], &*result);
}
// All-equal blocks (here all zeroes) take the `num_bits == 0` path of `decode`.
#[test]
fn many_zeros() {
let xs: Vec<_> = std::iter::repeat(0).take(1300).collect();
let mut bpvec = BpVec::new();
bpvec.extend_from_slice(&xs);
let result = bpvec.to_vec();
assert_eq!(xs, result);
}
// Same as `many_zeros` but with a non-zero initial value for every block.
#[test]
fn many_ones() {
let xs: Vec<_> = std::iter::repeat(1).take(1300).collect();
let mut bpvec = BpVec::new();
bpvec.extend_from_slice(&xs);
let result = bpvec.to_vec();
assert_eq!(xs, result);
}
}

View File

@ -0,0 +1,84 @@
use bitpacking::{BitPacker, BitPacker4x};
use byteorder::{ReadBytesExt, NativeEndian};
use zerocopy::AsBytes;
/// Codec that serializes a list of `u32` into bitpacked bytes and back.
///
/// Encoded layout:
/// - one byte: how many trailing numbers did not fit in a full block
///   (`len % BitPacker4x::BLOCK_LEN`),
/// - for every full block: one `num_bits` byte followed by the bitpacked block,
///   each block being delta-encoded against the last value of the previous
///   block (`0` for the first one),
/// - the trailing numbers, stored raw in native endianness.
///
/// An empty list is encoded as an empty byte slice.
pub struct CodecBitPacker4xSorted;

impl CodecBitPacker4xSorted {
    /// Encodes `item` into bytes.
    ///
    /// The sorted variants of the bitpacker are used, so `item` is expected to
    /// be sorted in ascending order (NOTE(review): the callers visible in this
    /// crate pass the content of a `SetBuf` — confirm for any new caller).
    ///
    /// This implementation always returns `Some`.
    pub fn bytes_encode(item: &[u32]) -> Option<Vec<u8>> {
        // This is a hotfix to the SIGSEGV
        // https://github.com/tantivy-search/bitpacking/issues/23
        if item.is_empty() {
            return Some(Vec::default())
        }

        let bitpacker = BitPacker4x::new();
        let mut compressed = Vec::new();
        let mut initial_value = 0;

        // The number of remaining numbers that don't fit in the block size.
        // It is strictly lower than BLOCK_LEN, which is why it is stored on one byte.
        compressed.push((item.len() % BitPacker4x::BLOCK_LEN) as u8);

        // we cannot use a mut slice here because of #68630, TooGeneric error.
        // we can probably avoid this new allocation by directly using the compressed final Vec.
        let mut buffer = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];

        for chunk in item.chunks(BitPacker4x::BLOCK_LEN) {
            if chunk.len() == BitPacker4x::BLOCK_LEN {
                // compute the number of bits necessary to encode this block
                let num_bits = bitpacker.num_bits_sorted(initial_value, chunk);
                // Encode the block numbers into the buffer using the num_bits
                let compressed_len = bitpacker.compress_sorted(initial_value, chunk, &mut buffer, num_bits);
                // Write the num_bits that will be read to decode this block
                compressed.push(num_bits);
                // Write the bytes of the compressed block numbers
                compressed.extend_from_slice(&buffer[..compressed_len]);
                // Save the initial_value, which is the last value of the n-1 used for the n block
                initial_value = *chunk.last().unwrap();
            } else {
                // Save the remaining numbers which don't fit inside of a BLOCK_LEN
                compressed.extend_from_slice(chunk.as_bytes());
            }
        }

        Some(compressed)
    }

    /// Decodes bytes produced by [`CodecBitPacker4xSorted::bytes_encode`].
    ///
    /// Returns `None` when `bytes` is malformed: the announced trailing numbers
    /// do not fit in the input, a block announces more than 32 bits per value,
    /// or a block is truncated. An empty input decodes to an empty list.
    pub fn bytes_decode(bytes: &[u8]) -> Option<Vec<u32>> {
        let (remaining, bytes) = match bytes.split_first() {
            Some((remaining, bytes)) => (*remaining as usize, bytes),
            None => return Some(Vec::new()),
        };

        let bitpacker = BitPacker4x::new();

        // The raw trailing numbers (4 bytes each) sit at the very end of the input;
        // refuse inputs that are too short to contain them instead of underflowing.
        let blocks_len = bytes.len().checked_sub(remaining * 4)?;
        let (mut bytes, mut remaining_bytes) = bytes.split_at(blocks_len);

        let mut decompressed = Vec::new();
        let mut initial_value = 0;

        while let Some(&num_bits) = bytes.first() {
            // A u32 never needs more than 32 bits, anything above is corrupted data.
            if num_bits > 32 {
                return None;
            }

            let block_size = BitPacker4x::compressed_block_size(num_bits);
            // Skip the num_bits byte and make sure the whole block is present.
            let block = bytes.get(1..block_size + 1)?;

            let new_len = decompressed.len() + BitPacker4x::BLOCK_LEN;
            decompressed.resize(new_len, 0);

            // Create a view into the decompressed buffer and decompress into it
            let to_decompress = &mut decompressed[new_len - BitPacker4x::BLOCK_LEN..new_len];
            bitpacker.decompress_sorted(initial_value, block, to_decompress, num_bits);

            // Set the new initial_value for the next block
            initial_value = decompressed[new_len - 1];

            // Advance the bytes offset to read the next block (+ num_bits)
            bytes = &bytes[block_size + 1..];
        }

        // We add the remaining uncompressed numbers.
        let new_len = decompressed.len() + remaining;
        decompressed.resize(new_len, 0);
        let to_decompress = &mut decompressed[new_len - remaining..new_len];
        remaining_bytes.read_u32_into::<NativeEndian>(to_decompress).ok()?;

        Some(decompressed)
    }
}

3
src/codec/mod.rs Normal file
View File

@ -0,0 +1,3 @@
mod bitpacker_sorted;
pub use self::bitpacker_sorted::CodecBitPacker4xSorted;

186
src/main.rs Normal file
View File

@ -0,0 +1,186 @@
#[cfg(test)]
#[macro_use] extern crate quickcheck;
mod codec;
mod bp_vec;
use std::collections::{HashMap, BTreeSet};
use std::convert::TryFrom;
use std::fs::File;
use std::hash::BuildHasherDefault;
use std::path::PathBuf;
use anyhow::{ensure, Context};
use fst::IntoStreamer;
use fxhash::FxHasher32;
use rayon::prelude::*;
use sdset::{SetOperation, SetBuf};
use slice_group_by::StrGroupBy;
use structopt::StructOpt;
use self::codec::CodecBitPacker4xSorted;
use self::bp_vec::BpVec;
/// A `HashMap` that hashes its keys with `FxHasher32` instead of the default hasher.
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
/// A `SmallString` backed by a `[u8; 32]` array.
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
// jemalloc is used as the global allocator, on Linux targets only.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
// Command line options of the indexer, parsed with `Opt::from_args()` in `main`.
//
// The `///` comments on the fields are picked up by structopt as the help text
// of each argument, so editing them changes the `--help` output.
//
// NOTE(review): the `about` text mentions "the daugt project" while the binary
// is named "mm-indexer"; it looks copy-pasted — confirm before changing it.
#[derive(Debug, StructOpt)]
#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
struct Opt {
/// The database path where the database is located.
/// It is created if it doesn't already exist.
#[structopt(long = "db", parse(from_os_str))]
database: PathBuf,
/// Files to index in parallel.
// Positional arguments: each file is handed to `index_csv` from a rayon
// parallel iterator in `main`.
files_to_index: Vec<PathBuf>,
}
/// Merge operator of the "postings-ids" tree: stores the union of the document
/// ids already present for a word with the newly indexed ones.
///
/// Both values are bitpacked with `CodecBitPacker4xSorted`. When the key has
/// no previous value the new value is stored as is.
///
/// Panics if one of the values cannot be decoded or is not a sorted,
/// deduplicated list of ids.
fn union_bitpacked_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
    let previous = match old_value {
        Some(bytes) => bytes,
        // First time this key is seen, there is nothing to merge with.
        None => return Some(new_value.to_vec()),
    };

    let previous_ids = CodecBitPacker4xSorted::bytes_decode(previous).unwrap();
    let incoming_ids = CodecBitPacker4xSorted::bytes_decode(new_value).unwrap();

    let previous_set = SetBuf::new(previous_ids).unwrap();
    let incoming_set = SetBuf::new(incoming_ids).unwrap();

    let merged = sdset::duo::Union::new(&previous_set, &incoming_set).into_set_buf();

    Some(CodecBitPacker4xSorted::bytes_encode(&merged).unwrap())
}
/// Merge operator of the main tree: stores the union of the words FST already
/// present under the "words-fst" key with the newly built one.
///
/// Any other key is not supported and hits `unimplemented!()`. When the key has
/// no previous value the new FST is stored as is.
///
/// Panics if one of the values is not a valid FST set.
fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
    if key != b"words-fst" {
        unimplemented!()
    }

    let existing = if let Some(bytes) = old_value {
        bytes
    } else {
        return Some(new_value.to_vec());
    };

    eprintln!("old_words size: {}", existing.len());
    eprintln!("new_words size: {}", new_value.len());

    let existing_set = fst::Set::new(existing).unwrap();
    let incoming_set = fst::Set::new(new_value).unwrap();

    // Stream the union of both sets, in order, into a fresh in-memory builder.
    let merged = existing_set.op().add(incoming_set.into_stream()).r#union();
    let mut builder = fst::SetBuilder::memory();
    builder.extend_stream(merged.into_stream()).unwrap();

    let bytes = builder.into_inner().unwrap();
    Some(bytes)
}
/// Splits `string` into runs of consecutive characters that are either all
/// alphanumeric or all non-alphanumeric, and yields only the alphanumeric runs.
fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
    string
        .linear_group_by_key(|c| c.is_alphanumeric())
        // All the characters of a group share the same class, so looking at
        // the first one is enough to classify the whole group.
        .filter(|group| match group.chars().next() {
            Some(first) => first.is_alphanumeric(),
            None => false,
        })
}
/// Indexes every record of the CSV reader `rdr` into the database `db`.
///
/// - The CSV headers are stored under the "headers" key of the main tree.
/// - Every record gets a freshly generated `u32` document id and is stored,
///   re-serialized as CSV, in the "documents" tree.
/// - For every alphanumeric token of every field, the document id is added to
///   the postings list of that token, merged into the "postings-ids" tree once
///   the whole file has been read.
/// - The set of all seen tokens is merged, as an FST, under the "words-fst" key
///   of the main tree.
///
/// `tid` is only used to tag the progress messages printed on stderr.
///
/// Returns the number of documents read from `rdr`.
///
/// # Errors
///
/// Returns an error when reading the CSV fails, when a database operation
/// fails, when the stored headers differ from the ones of this file or when a
/// generated document id does not fit in a `u32`.
fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Result<usize> {
// Upper bounds on the number of tokens read per field and of fields read per record.
const MAX_POSITION: usize = 1000;
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
let main = &*db;
let postings_ids = db.open_tree("postings-ids")?;
let documents = db.open_tree("documents")?;
// Record buffer reused for every row of the file.
let mut document = csv::StringRecord::new();
// word -> bitpacked list of the ids of the documents containing it.
let mut new_postings_ids = FastMap4::default();
let mut new_words = BTreeSet::default();
let mut number_of_documents = 0;
// Write the headers into a Vec of bytes.
let headers = rdr.headers()?;
let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
writer.write_byte_record(headers.as_byte_record())?;
let headers = writer.into_inner()?;
// NOTE(review): the insert happens before the comparison, so when the headers
// differ the new ones appear to have been stored already when the error is
// returned — confirm whether that is intended.
if let Some(old_headers) = main.insert("headers", headers.as_slice())? {
ensure!(old_headers == headers, "headers differs from the previous ones");
}
while rdr.read_record(&mut document)? {
let document_id = db.generate_id()?;
let document_id = u32::try_from(document_id).context("Generated id is too big")?;
// The attribute and position indexes are not used yet: `enumerate` + `take`
// only bound the number of fields and of tokens per field that are indexed.
for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
// A word occurring several times in a document pushes the same id several
// times; duplicates are removed later by `SetBuf::from_dirty`.
new_postings_ids.entry(SmallString32::from(word)).or_insert_with(BpVec::new).push(document_id);
}
}
// We write the document in the database.
let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
writer.write_byte_record(document.as_byte_record())?;
let document = writer.into_inner()?;
// Keys are the big-endian bytes of the document id.
documents.insert(document_id.to_be_bytes(), document)?;
number_of_documents += 1;
// Progress report every 100000 documents.
if number_of_documents % 100000 == 0 {
// NOTE(review): `BpVec::capacity` adds a byte count to a `u32` count, so
// this size is only a rough estimate.
let postings_ids_size = new_postings_ids.iter().map(|(_, v)| v.capacity() * 4).sum::<usize>();
eprintln!("{}, documents seen {}, postings size {}",
tid, number_of_documents, postings_ids_size);
}
}
eprintln!("Start collecting the postings lists and words");
// We compute and store the postings list into the DB.
for (word, new_ids) in new_postings_ids {
// Sort and deduplicate the ids before compressing them.
let new_ids = SetBuf::from_dirty(new_ids.to_vec());
let compressed = CodecBitPacker4xSorted::bytes_encode(&new_ids)
.context("error while compressing using CodecBitPacker4xSorted")?;
// `merge` goes through the merge operator registered on this tree in `main`.
postings_ids.merge(word.as_bytes(), compressed)?;
new_words.insert(word);
}
eprintln!("Finished collecting the postings lists and words");
eprintln!("Start merging the words-fst");
// The words come out of the `BTreeSet` in sorted order.
let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?;
// The words are now in the FST, release the set before merging.
drop(new_words);
main.merge("words-fst", new_words_fst.as_fst().as_bytes())?;
eprintln!("Finished merging the words-fst");
Ok(number_of_documents)
}
/// Entry point: opens the database, registers the merge operators and indexes
/// every file given on the command line in parallel.
///
/// The outcome (total number of indexed documents, or the first error) is
/// printed on stdout, and an indexing error is also returned so that the
/// process exits with a failure status.
fn main() -> anyhow::Result<()> {
    let opt = Opt::from_args();

    let db = sled::open(opt.database)?;
    let main = &*db;

    // Setup the merge operators
    main.set_merge_operator(union_words_fst);
    let postings_ids = db.open_tree("postings-ids")?;
    postings_ids.set_merge_operator(union_bitpacked_postings_ids);
    // The handle itself is unused here; presumably this only makes sure the
    // tree exists before the indexing starts.
    let _documents = db.open_tree("documents")?;

    // One task per file; the per-file document counts are summed and the
    // reduction stops at the first error.
    let res = opt.files_to_index
        .into_par_iter()
        .enumerate()
        .map(|(tid, path)| {
            let rdr = csv::Reader::from_path(path)?;
            index_csv(tid, db.clone(), rdr)
        })
        .try_reduce(|| 0, |a, b| Ok(a + b));

    println!("{:?}", res);

    // Do not swallow the failure: returning `Ok(())` unconditionally made the
    // process exit successfully even when the indexing failed.
    res.map(|_| ())
}